[llvm] 3e0a76b - [Codegen][LegalizeIntegerTypes] Improve shift through stack (#96151)

via llvm-commits llvm-commits at lists.llvm.org
Mon Sep 23 02:45:53 PDT 2024


Author: futog
Date: 2024-09-23T11:45:43+02:00
New Revision: 3e0a76b1fd10d2f5f36d34a91b525c1d29685185

URL: https://github.com/llvm/llvm-project/commit/3e0a76b1fd10d2f5f36d34a91b525c1d29685185
DIFF: https://github.com/llvm/llvm-project/commit/3e0a76b1fd10d2f5f36d34a91b525c1d29685185.diff

LOG: [Codegen][LegalizeIntegerTypes] Improve shift through stack (#96151)

Minor improvement on cc39c3b17fb2598e20ca0854f9fe6d69169d85c7.

Use an aligned stack slot to store the shifted value.
Use the native register width as the shifting unit, so that the load of
the shift result is aligned.

If the shift amount is a multiple of the native register width, there is
no need to do a follow-up shift after the load. I added new tests for
these cases.
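
For illustration, a minimal, self-contained C++ sketch of the
lshr-through-stack idea this patch tunes, assuming a 256-bit value split
into 64-bit words and a 64-bit shift unit (the names below are made up
for the sketch; they are not the DAG legalizer's):

#include <cstdint>
#include <cstdio>
#include <cstring>

// Logical right shift of a 256-bit value; w[0] is the least significant
// word, and amt must be < 256 (out-of-range amounts are the poison case
// the legalizer clamps for).
static void lshr256(uint64_t w[4], unsigned amt) {
  // 2x-wide, unit-aligned "stack slot": the value in the low half,
  // zeros above it.
  uint64_t slot[8] = {w[0], w[1], w[2], w[3], 0, 0, 0, 0};
  const unsigned Unit = 64; // native register width in bits
  uint64_t res[4];
  // Aligned load at a whole-unit offset: performs (amt / Unit) * Unit
  // bits of the shift in one step.
  memcpy(res, &slot[amt / Unit], sizeof(res));
  // Follow-up sub-unit shift, skipped when amt is a multiple of the unit
  // (the case the new *_dwordOff tests exercise).
  if (unsigned Rem = amt % Unit) {
    for (int i = 0; i < 4; ++i)
      res[i] = (res[i] >> Rem) | (i + 1 < 4 ? res[i + 1] << (Unit - Rem) : 0);
  }
  memcpy(w, res, sizeof(res));
}

int main() {
  uint64_t v[4] = {0, 0, 0, 0x8000000000000000ull}; // 1 << 255
  lshr256(v, 192); // multiple of the unit: no follow-up shift is needed
  printf("0x%llx\n", (unsigned long long)v[0]);     // 0x8000000000000000
}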

Co-authored-by: Gergely Futo <gergely.futo at hightec-rt.com>

Added: 
    

Modified: 
    llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp
    llvm/test/CodeGen/AArch64/wide-scalar-shift-by-byte-multiple-legalization.ll
    llvm/test/CodeGen/AArch64/wide-scalar-shift-legalization.ll
    llvm/test/CodeGen/Mips/llvm-ir/ashr.ll
    llvm/test/CodeGen/Mips/llvm-ir/lshr.ll
    llvm/test/CodeGen/Mips/llvm-ir/shl.ll
    llvm/test/CodeGen/PowerPC/ctrloop-sh.ll
    llvm/test/CodeGen/PowerPC/pr59074.ll
    llvm/test/CodeGen/PowerPC/wide-scalar-shift-by-byte-multiple-legalization.ll
    llvm/test/CodeGen/PowerPC/wide-scalar-shift-legalization.ll
    llvm/test/CodeGen/RISCV/shifts.ll
    llvm/test/CodeGen/RISCV/wide-scalar-shift-by-byte-multiple-legalization.ll
    llvm/test/CodeGen/RISCV/wide-scalar-shift-legalization.ll
    llvm/test/CodeGen/X86/div-rem-pair-recomposition-signed.ll
    llvm/test/CodeGen/X86/div-rem-pair-recomposition-unsigned.ll
    llvm/test/CodeGen/X86/pr38539.ll
    llvm/test/CodeGen/X86/scheduler-backtracking.ll
    llvm/test/CodeGen/X86/shift-i128.ll
    llvm/test/CodeGen/X86/shift-i256.ll
    llvm/test/CodeGen/X86/wide-scalar-shift-by-byte-multiple-legalization.ll
    llvm/test/CodeGen/X86/wide-scalar-shift-legalization.ll
    llvm/test/CodeGen/X86/widen-load-of-small-alloca-with-zero-upper-half.ll
    llvm/test/CodeGen/X86/widen-load-of-small-alloca.ll

Removed: 
    


################################################################################
diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp
index c622b2abedeacf..ee9c95c8593766 100644
--- a/llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp
@@ -4608,14 +4608,23 @@ void DAGTypeLegalizer::ExpandIntRes_ShiftThroughStack(SDNode *N, SDValue &Lo,
   SDValue ShAmt = N->getOperand(1);
   EVT ShAmtVT = ShAmt.getValueType();
 
-  // This legalization is optimal when the shift is by a multiple of byte width,
-  //   %x * 8 <-> %x << 3   so 3 low bits should be known zero.
-  bool ShiftByByteMultiple =
-      DAG.computeKnownBits(ShAmt).countMinTrailingZeros() >= 3;
+  EVT LoadVT = VT;
+  do {
+    LoadVT = TLI.getTypeToTransformTo(*DAG.getContext(), LoadVT);
+  } while (!TLI.isTypeLegal(LoadVT));
+
+  const unsigned ShiftUnitInBits = LoadVT.getStoreSizeInBits();
+  assert(ShiftUnitInBits <= VT.getScalarSizeInBits());
+  assert(isPowerOf2_32(ShiftUnitInBits) &&
+         "Shifting unit is not a a power of two!");
+
+  const bool IsOneStepShift =
+      DAG.computeKnownBits(ShAmt).countMinTrailingZeros() >=
+      Log2_32(ShiftUnitInBits);
 
   // If we can't do it as one step, we'll have two uses of shift amount,
   // and thus must freeze it.
-  if (!ShiftByByteMultiple)
+  if (!IsOneStepShift)
     ShAmt = DAG.getFreeze(ShAmt);
 
   unsigned VTBitWidth = VT.getScalarSizeInBits();
@@ -4629,10 +4638,9 @@ void DAGTypeLegalizer::ExpandIntRes_ShiftThroughStack(SDNode *N, SDValue &Lo,
 
   // Get a temporary stack slot 2x the width of our VT.
   // FIXME: reuse stack slots?
-  // FIXME: should we be more picky about alignment?
-  Align StackSlotAlignment(1);
-  SDValue StackPtr = DAG.CreateStackTemporary(
-      TypeSize::getFixed(StackSlotByteWidth), StackSlotAlignment);
+  Align StackAlign = DAG.getReducedAlign(StackSlotVT, /*UseABI=*/false);
+  SDValue StackPtr =
+      DAG.CreateStackTemporary(StackSlotVT.getStoreSize(), StackAlign);
   EVT PtrTy = StackPtr.getValueType();
   SDValue Ch = DAG.getEntryNode();
 
@@ -4652,15 +4660,22 @@ void DAGTypeLegalizer::ExpandIntRes_ShiftThroughStack(SDNode *N, SDValue &Lo,
     Init = DAG.getNode(ISD::BUILD_PAIR, dl, StackSlotVT, AllZeros, Shiftee);
   }
   // And spill it into the stack slot.
-  Ch = DAG.getStore(Ch, dl, Init, StackPtr, StackPtrInfo, StackSlotAlignment);
+  Ch = DAG.getStore(Ch, dl, Init, StackPtr, StackPtrInfo, StackAlign);
 
   // Now, compute the full-byte offset into stack slot from where we can load.
-  // We have shift amount, which is in bits, but in multiples of byte.
-  // So just divide by CHAR_BIT.
+  // We have the shift amount, which is in bits. The offset should point to
+  // an aligned address.
   SDNodeFlags Flags;
-  if (ShiftByByteMultiple)
-    Flags.setExact(true);
-  SDValue ByteOffset = DAG.getNode(ISD::SRL, dl, ShAmtVT, ShAmt,
+  Flags.setExact(IsOneStepShift);
+  SDValue SrlTmp = DAG.getNode(
+      ISD::SRL, dl, ShAmtVT, ShAmt,
+      DAG.getConstant(Log2_32(ShiftUnitInBits), dl, ShAmtVT), Flags);
+  SDValue BitOffset =
+      DAG.getNode(ISD::SHL, dl, ShAmtVT, SrlTmp,
+                  DAG.getConstant(Log2_32(ShiftUnitInBits), dl, ShAmtVT));
+
+  Flags.setExact(true);
+  SDValue ByteOffset = DAG.getNode(ISD::SRL, dl, ShAmtVT, BitOffset,
                                    DAG.getConstant(3, dl, ShAmtVT), Flags);
   // And clamp it, because OOB load is an immediate UB,
   // while shift overflow would have *just* been poison.
@@ -4689,15 +4704,16 @@ void DAGTypeLegalizer::ExpandIntRes_ShiftThroughStack(SDNode *N, SDValue &Lo,
   AdjStackPtr = DAG.getMemBasePlusOffset(AdjStackPtr, ByteOffset, dl);
 
   // And load it! While the load is not legal, legalizing it is obvious.
-  SDValue Res = DAG.getLoad(
-      VT, dl, Ch, AdjStackPtr,
-      MachinePointerInfo::getUnknownStack(DAG.getMachineFunction()), Align(1));
-  // We've performed the shift by a CHAR_BIT * [_ShAmt / CHAR_BIT_]
-
-  // If we may still have a less-than-CHAR_BIT to shift by, do so now.
-  if (!ShiftByByteMultiple) {
-    SDValue ShAmtRem = DAG.getNode(ISD::AND, dl, ShAmtVT, ShAmt,
-                                   DAG.getConstant(7, dl, ShAmtVT));
+  SDValue Res =
+      DAG.getLoad(VT, dl, Ch, AdjStackPtr,
+                  MachinePointerInfo::getUnknownStack(DAG.getMachineFunction()),
+                  commonAlignment(StackAlign, LoadVT.getStoreSize()));
+
+  // If we may still have remaining bits to shift by, do so now.
+  if (!IsOneStepShift) {
+    SDValue ShAmtRem =
+        DAG.getNode(ISD::AND, dl, ShAmtVT, ShAmt,
+                    DAG.getConstant(ShiftUnitInBits - 1, dl, ShAmtVT));
     Res = DAG.getNode(N->getOpcode(), dl, VT, Res, ShAmtRem);
   }
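
As a concrete trace of the new offset computation (assuming VT = i256 and
a legal 64-bit LoadVT, so ShiftUnitInBits = 64 and Log2_32(ShiftUnitInBits)
= 6): for a shift amount known to be 100, SrlTmp = 100 >> 6 = 1, BitOffset
= 1 << 6 = 64, ByteOffset = 64 >> 3 = 8, and the follow-up shift is
ShAmtRem = 100 & 63 = 36. For a shift amount known to be 128 (at least 6
trailing zero bits, so IsOneStepShift holds), ByteOffset = 16 and no
follow-up shift is emitted.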
 

diff --git a/llvm/test/CodeGen/AArch64/wide-scalar-shift-by-byte-multiple-legalization.ll b/llvm/test/CodeGen/AArch64/wide-scalar-shift-by-byte-multiple-legalization.ll
index e21015ad3db30c..b02788ab1b34c1 100644
--- a/llvm/test/CodeGen/AArch64/wide-scalar-shift-by-byte-multiple-legalization.ll
+++ b/llvm/test/CodeGen/AArch64/wide-scalar-shift-by-byte-multiple-legalization.ll
@@ -186,10 +186,54 @@ define void @lshr_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ; ALL-NEXT:    ldr q1, [x0]
 ; ALL-NEXT:    stp x9, x8, [sp, #16]
 ; ALL-NEXT:    mov x8, sp
-; ALL-NEXT:    and x9, x10, #0x1f
+; ALL-NEXT:    and x9, x10, #0x18
 ; ALL-NEXT:    str q1, [sp]
 ; ALL-NEXT:    add x8, x8, x9
+; ALL-NEXT:    lsl x9, x10, #3
 ; ALL-NEXT:    stp q0, q0, [sp, #32]
+; ALL-NEXT:    ldp x11, x10, [x8, #16]
+; ALL-NEXT:    mvn w13, w9
+; ALL-NEXT:    ldp x8, x12, [x8]
+; ALL-NEXT:    and x9, x9, #0x38
+; ALL-NEXT:    lsl x14, x10, #1
+; ALL-NEXT:    lsl x15, x11, #1
+; ALL-NEXT:    lsr x11, x11, x9
+; ALL-NEXT:    lsl x16, x12, #1
+; ALL-NEXT:    lsr x10, x10, x9
+; ALL-NEXT:    lsr x12, x12, x9
+; ALL-NEXT:    lsl x14, x14, x13
+; ALL-NEXT:    lsr x8, x8, x9
+; ALL-NEXT:    lsl x9, x16, x13
+; ALL-NEXT:    lsl x13, x15, x13
+; ALL-NEXT:    orr x11, x14, x11
+; ALL-NEXT:    orr x8, x9, x8
+; ALL-NEXT:    orr x9, x12, x13
+; ALL-NEXT:    stp x11, x10, [x2, #16]
+; ALL-NEXT:    stp x8, x9, [x2]
+; ALL-NEXT:    add sp, sp, #64
+; ALL-NEXT:    ret
+  %src = load i256, ptr %src.ptr, align 1
+  %byteOff = load i256, ptr %byteOff.ptr, align 1
+  %bitOff = shl i256 %byteOff, 3
+  %res = lshr i256 %src, %bitOff
+  store i256 %res, ptr %dst, align 1
+  ret void
+}
+
+define void @lshr_32bytes_dwordOff(ptr %src.ptr, ptr %dwordOff.ptr, ptr %dst) nounwind {
+; ALL-LABEL: lshr_32bytes_dwordOff:
+; ALL:       // %bb.0:
+; ALL-NEXT:    sub sp, sp, #64
+; ALL-NEXT:    ldp x9, x8, [x0, #16]
+; ALL-NEXT:    movi v0.2d, #0000000000000000
+; ALL-NEXT:    ldr x10, [x1]
+; ALL-NEXT:    ldr q1, [x0]
+; ALL-NEXT:    stp x9, x8, [sp, #16]
+; ALL-NEXT:    ubfiz x8, x10, #3, #2
+; ALL-NEXT:    mov x9, sp
+; ALL-NEXT:    str q1, [sp]
+; ALL-NEXT:    stp q0, q0, [sp, #32]
+; ALL-NEXT:    add x8, x9, x8
 ; ALL-NEXT:    ldp x10, x9, [x8, #16]
 ; ALL-NEXT:    ldr q0, [x8]
 ; ALL-NEXT:    str q0, [x2]
@@ -197,12 +241,13 @@ define void @lshr_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ; ALL-NEXT:    add sp, sp, #64
 ; ALL-NEXT:    ret
   %src = load i256, ptr %src.ptr, align 1
-  %byteOff = load i256, ptr %byteOff.ptr, align 1
-  %bitOff = shl i256 %byteOff, 3
+  %dwordOff = load i256, ptr %dwordOff.ptr, align 1
+  %bitOff = shl i256 %dwordOff, 6
   %res = lshr i256 %src, %bitOff
   store i256 %res, ptr %dst, align 1
   ret void
 }
+
 define void @shl_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ; ALL-LABEL: shl_32bytes:
 ; ALL:       // %bb.0:
@@ -213,11 +258,56 @@ define void @shl_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ; ALL-NEXT:    ldr q1, [x0]
 ; ALL-NEXT:    stp x9, x8, [sp, #48]
 ; ALL-NEXT:    mov x8, sp
-; ALL-NEXT:    and x9, x10, #0x1f
+; ALL-NEXT:    and x9, x10, #0x18
 ; ALL-NEXT:    add x8, x8, #32
 ; ALL-NEXT:    stp q0, q0, [sp]
 ; ALL-NEXT:    str q1, [sp, #32]
 ; ALL-NEXT:    sub x8, x8, x9
+; ALL-NEXT:    lsl x9, x10, #3
+; ALL-NEXT:    ldp x10, x11, [x8]
+; ALL-NEXT:    ldp x12, x8, [x8, #16]
+; ALL-NEXT:    mvn w13, w9
+; ALL-NEXT:    and x9, x9, #0x38
+; ALL-NEXT:    lsr x14, x10, #1
+; ALL-NEXT:    lsr x15, x11, #1
+; ALL-NEXT:    lsl x11, x11, x9
+; ALL-NEXT:    lsr x16, x12, #1
+; ALL-NEXT:    lsl x10, x10, x9
+; ALL-NEXT:    lsl x12, x12, x9
+; ALL-NEXT:    lsr x14, x14, x13
+; ALL-NEXT:    lsl x8, x8, x9
+; ALL-NEXT:    lsr x9, x16, x13
+; ALL-NEXT:    lsr x13, x15, x13
+; ALL-NEXT:    orr x11, x11, x14
+; ALL-NEXT:    orr x8, x8, x9
+; ALL-NEXT:    orr x9, x12, x13
+; ALL-NEXT:    stp x10, x11, [x2]
+; ALL-NEXT:    stp x9, x8, [x2, #16]
+; ALL-NEXT:    add sp, sp, #64
+; ALL-NEXT:    ret
+  %src = load i256, ptr %src.ptr, align 1
+  %byteOff = load i256, ptr %byteOff.ptr, align 1
+  %bitOff = shl i256 %byteOff, 3
+  %res = shl i256 %src, %bitOff
+  store i256 %res, ptr %dst, align 1
+  ret void
+}
+
+define void @shl_32bytes_dwordOff(ptr %src.ptr, ptr %dwordOff.ptr, ptr %dst) nounwind {
+; ALL-LABEL: shl_32bytes_dwordOff:
+; ALL:       // %bb.0:
+; ALL-NEXT:    sub sp, sp, #64
+; ALL-NEXT:    ldp x9, x8, [x0, #16]
+; ALL-NEXT:    movi v0.2d, #0000000000000000
+; ALL-NEXT:    ldr x10, [x1]
+; ALL-NEXT:    ldr q1, [x0]
+; ALL-NEXT:    stp x9, x8, [sp, #48]
+; ALL-NEXT:    mov x8, sp
+; ALL-NEXT:    ubfiz x9, x10, #3, #2
+; ALL-NEXT:    add x8, x8, #32
+; ALL-NEXT:    stp q0, q1, [sp, #16]
+; ALL-NEXT:    str q0, [sp]
+; ALL-NEXT:    sub x8, x8, x9
 ; ALL-NEXT:    ldp x9, x10, [x8, #16]
 ; ALL-NEXT:    ldr q0, [x8]
 ; ALL-NEXT:    str q0, [x2]
@@ -225,12 +315,13 @@ define void @shl_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ; ALL-NEXT:    add sp, sp, #64
 ; ALL-NEXT:    ret
   %src = load i256, ptr %src.ptr, align 1
-  %byteOff = load i256, ptr %byteOff.ptr, align 1
-  %bitOff = shl i256 %byteOff, 3
+  %dwordOff = load i256, ptr %dwordOff.ptr, align 1
+  %bitOff = shl i256 %dwordOff, 6
   %res = shl i256 %src, %bitOff
   store i256 %res, ptr %dst, align 1
   ret void
 }
+
 define void @ashr_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ; ALL-LABEL: ashr_32bytes:
 ; ALL:       // %bb.0:
@@ -238,14 +329,59 @@ define void @ashr_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ; ALL-NEXT:    ldp x9, x8, [x0, #16]
 ; ALL-NEXT:    ldr x10, [x1]
 ; ALL-NEXT:    ldr q0, [x0]
-; ALL-NEXT:    and x10, x10, #0x1f
+; ALL-NEXT:    and x11, x10, #0x18
 ; ALL-NEXT:    stp x9, x8, [sp, #16]
 ; ALL-NEXT:    asr x8, x8, #63
 ; ALL-NEXT:    mov x9, sp
 ; ALL-NEXT:    str q0, [sp]
+; ALL-NEXT:    add x9, x9, x11
+; ALL-NEXT:    stp x8, x8, [sp, #48]
+; ALL-NEXT:    stp x8, x8, [sp, #32]
+; ALL-NEXT:    lsl x8, x10, #3
+; ALL-NEXT:    ldp x11, x10, [x9, #16]
+; ALL-NEXT:    ldp x9, x12, [x9]
+; ALL-NEXT:    mvn w13, w8
+; ALL-NEXT:    and x8, x8, #0x38
+; ALL-NEXT:    lsl x14, x10, #1
+; ALL-NEXT:    lsl x15, x11, #1
+; ALL-NEXT:    lsr x11, x11, x8
+; ALL-NEXT:    lsl x16, x12, #1
+; ALL-NEXT:    asr x10, x10, x8
+; ALL-NEXT:    lsr x12, x12, x8
+; ALL-NEXT:    lsl x14, x14, x13
+; ALL-NEXT:    lsr x8, x9, x8
+; ALL-NEXT:    lsl x9, x16, x13
+; ALL-NEXT:    lsl x13, x15, x13
+; ALL-NEXT:    orr x11, x14, x11
+; ALL-NEXT:    orr x8, x9, x8
+; ALL-NEXT:    orr x9, x12, x13
+; ALL-NEXT:    stp x11, x10, [x2, #16]
+; ALL-NEXT:    stp x8, x9, [x2]
+; ALL-NEXT:    add sp, sp, #64
+; ALL-NEXT:    ret
+  %src = load i256, ptr %src.ptr, align 1
+  %byteOff = load i256, ptr %byteOff.ptr, align 1
+  %bitOff = shl i256 %byteOff, 3
+  %res = ashr i256 %src, %bitOff
+  store i256 %res, ptr %dst, align 1
+  ret void
+}
+
+define void @ashr_32bytes_dwordOff(ptr %src.ptr, ptr %dwordOff.ptr, ptr %dst) nounwind {
+; ALL-LABEL: ashr_32bytes_dwordOff:
+; ALL:       // %bb.0:
+; ALL-NEXT:    sub sp, sp, #64
+; ALL-NEXT:    ldp x9, x8, [x0, #16]
+; ALL-NEXT:    ldr x10, [x1]
+; ALL-NEXT:    ldr q0, [x0]
+; ALL-NEXT:    stp x9, x8, [sp, #16]
+; ALL-NEXT:    asr x8, x8, #63
+; ALL-NEXT:    ubfiz x9, x10, #3, #2
+; ALL-NEXT:    mov x10, sp
+; ALL-NEXT:    str q0, [sp]
 ; ALL-NEXT:    stp x8, x8, [sp, #48]
 ; ALL-NEXT:    stp x8, x8, [sp, #32]
-; ALL-NEXT:    add x8, x9, x10
+; ALL-NEXT:    add x8, x10, x9
 ; ALL-NEXT:    ldp x10, x9, [x8, #16]
 ; ALL-NEXT:    ldr q0, [x8]
 ; ALL-NEXT:    str q0, [x2]
@@ -253,8 +389,8 @@ define void @ashr_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ; ALL-NEXT:    add sp, sp, #64
 ; ALL-NEXT:    ret
   %src = load i256, ptr %src.ptr, align 1
-  %byteOff = load i256, ptr %byteOff.ptr, align 1
-  %bitOff = shl i256 %byteOff, 3
+  %dwordOff = load i256, ptr %dwordOff.ptr, align 1
+  %bitOff = shl i256 %dwordOff, 6
   %res = ashr i256 %src, %bitOff
   store i256 %res, ptr %dst, align 1
   ret void

diff --git a/llvm/test/CodeGen/AArch64/wide-scalar-shift-legalization.ll b/llvm/test/CodeGen/AArch64/wide-scalar-shift-legalization.ll
index a4da6db57ecae3..531e0fa740da78 100644
--- a/llvm/test/CodeGen/AArch64/wide-scalar-shift-legalization.ll
+++ b/llvm/test/CodeGen/AArch64/wide-scalar-shift-legalization.ll
@@ -160,30 +160,33 @@ define void @lshr_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; ALL-NEXT:    ldr x10, [x1]
 ; ALL-NEXT:    ldr q1, [x0]
 ; ALL-NEXT:    stp x9, x8, [sp, #16]
-; ALL-NEXT:    ubfx x8, x10, #3, #5
+; ALL-NEXT:    lsr x8, x10, #3
 ; ALL-NEXT:    mov x9, sp
 ; ALL-NEXT:    str q1, [sp]
-; ALL-NEXT:    and x10, x10, #0x7
+; ALL-NEXT:    and x12, x10, #0x3f
+; ALL-NEXT:    and x8, x8, #0x18
 ; ALL-NEXT:    stp q0, q0, [sp, #32]
+; ALL-NEXT:    eor x12, x12, #0x3f
 ; ALL-NEXT:    add x8, x9, x8
-; ALL-NEXT:    mvn w13, w10
-; ALL-NEXT:    ldp x11, x9, [x8, #16]
-; ALL-NEXT:    ldp x8, x12, [x8]
+; ALL-NEXT:    ldp x13, x11, [x8]
+; ALL-NEXT:    ldr x9, [x8, #24]
+; ALL-NEXT:    ldr x8, [x8, #16]
 ; ALL-NEXT:    lsl x14, x9, #1
+; ALL-NEXT:    lsr x9, x9, x10
 ; ALL-NEXT:    lsl x15, x11, #1
 ; ALL-NEXT:    lsr x11, x11, x10
-; ALL-NEXT:    lsl x16, x12, #1
-; ALL-NEXT:    lsr x9, x9, x10
-; ALL-NEXT:    lsr x12, x12, x10
-; ALL-NEXT:    lsl x14, x14, x13
+; ALL-NEXT:    lsr x13, x13, x10
+; ALL-NEXT:    lsl x14, x14, x12
+; ALL-NEXT:    lsl x12, x15, x12
+; ALL-NEXT:    lsl x15, x8, #1
 ; ALL-NEXT:    lsr x8, x8, x10
-; ALL-NEXT:    lsl x10, x16, x13
-; ALL-NEXT:    lsl x13, x15, x13
-; ALL-NEXT:    orr x11, x14, x11
-; ALL-NEXT:    stp x11, x9, [x2, #16]
-; ALL-NEXT:    orr x8, x10, x8
+; ALL-NEXT:    mvn w10, w10
+; ALL-NEXT:    lsl x10, x15, x10
+; ALL-NEXT:    orr x8, x14, x8
+; ALL-NEXT:    stp x8, x9, [x2, #16]
 ; ALL-NEXT:    orr x9, x12, x13
-; ALL-NEXT:    stp x8, x9, [x2]
+; ALL-NEXT:    orr x8, x11, x10
+; ALL-NEXT:    stp x9, x8, [x2]
 ; ALL-NEXT:    add sp, sp, #64
 ; ALL-NEXT:    ret
   %src = load i256, ptr %src.ptr, align 1
@@ -201,31 +204,34 @@ define void @shl_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; ALL-NEXT:    ldr x10, [x1]
 ; ALL-NEXT:    ldr q1, [x0]
 ; ALL-NEXT:    stp x9, x8, [sp, #48]
-; ALL-NEXT:    mov x8, sp
-; ALL-NEXT:    ubfx x9, x10, #3, #5
-; ALL-NEXT:    add x8, x8, #32
+; ALL-NEXT:    lsr x8, x10, #3
+; ALL-NEXT:    mov x9, sp
+; ALL-NEXT:    add x9, x9, #32
 ; ALL-NEXT:    stp q0, q1, [sp, #16]
-; ALL-NEXT:    and x10, x10, #0x7
+; ALL-NEXT:    and x12, x10, #0x3f
+; ALL-NEXT:    and x8, x8, #0x18
 ; ALL-NEXT:    str q0, [sp]
-; ALL-NEXT:    sub x8, x8, x9
-; ALL-NEXT:    mvn w13, w10
-; ALL-NEXT:    ldp x9, x11, [x8]
-; ALL-NEXT:    ldp x12, x8, [x8, #16]
-; ALL-NEXT:    lsr x14, x9, #1
-; ALL-NEXT:    lsr x15, x11, #1
-; ALL-NEXT:    lsl x11, x11, x10
-; ALL-NEXT:    lsr x16, x12, #1
+; ALL-NEXT:    eor x12, x12, #0x3f
+; ALL-NEXT:    sub x8, x9, x8
+; ALL-NEXT:    ldp x11, x13, [x8, #16]
+; ALL-NEXT:    ldr x9, [x8]
+; ALL-NEXT:    ldr x8, [x8, #8]
+; ALL-NEXT:    lsr x15, x9, #1
 ; ALL-NEXT:    lsl x9, x9, x10
-; ALL-NEXT:    lsl x12, x12, x10
-; ALL-NEXT:    lsr x14, x14, x13
+; ALL-NEXT:    lsr x14, x11, #1
+; ALL-NEXT:    lsl x11, x11, x10
+; ALL-NEXT:    lsl x13, x13, x10
+; ALL-NEXT:    lsr x14, x14, x12
+; ALL-NEXT:    lsr x12, x15, x12
+; ALL-NEXT:    lsr x15, x8, #1
 ; ALL-NEXT:    lsl x8, x8, x10
-; ALL-NEXT:    lsr x10, x16, x13
-; ALL-NEXT:    lsr x13, x15, x13
-; ALL-NEXT:    orr x11, x11, x14
-; ALL-NEXT:    stp x9, x11, [x2]
-; ALL-NEXT:    orr x8, x8, x10
-; ALL-NEXT:    orr x9, x12, x13
-; ALL-NEXT:    stp x9, x8, [x2, #16]
+; ALL-NEXT:    mvn w10, w10
+; ALL-NEXT:    lsr x10, x15, x10
+; ALL-NEXT:    orr x8, x8, x12
+; ALL-NEXT:    stp x9, x8, [x2]
+; ALL-NEXT:    orr x9, x13, x14
+; ALL-NEXT:    orr x8, x11, x10
+; ALL-NEXT:    stp x8, x9, [x2, #16]
 ; ALL-NEXT:    add sp, sp, #64
 ; ALL-NEXT:    ret
   %src = load i256, ptr %src.ptr, align 1
@@ -243,31 +249,34 @@ define void @ashr_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; ALL-NEXT:    ldr x10, [x1]
 ; ALL-NEXT:    ldr q0, [x0]
 ; ALL-NEXT:    stp x9, x8, [sp, #16]
+; ALL-NEXT:    lsr x9, x10, #3
 ; ALL-NEXT:    asr x8, x8, #63
-; ALL-NEXT:    ubfx x9, x10, #3, #5
 ; ALL-NEXT:    str q0, [sp]
-; ALL-NEXT:    and x10, x10, #0x7
+; ALL-NEXT:    and x12, x10, #0x3f
+; ALL-NEXT:    and x9, x9, #0x18
 ; ALL-NEXT:    stp x8, x8, [sp, #48]
-; ALL-NEXT:    add x9, x11, x9
-; ALL-NEXT:    mvn w13, w10
+; ALL-NEXT:    eor x12, x12, #0x3f
 ; ALL-NEXT:    stp x8, x8, [sp, #32]
-; ALL-NEXT:    ldp x11, x8, [x9, #16]
-; ALL-NEXT:    ldp x9, x12, [x9]
-; ALL-NEXT:    lsl x14, x8, #1
+; ALL-NEXT:    add x8, x11, x9
+; ALL-NEXT:    ldp x13, x11, [x8]
+; ALL-NEXT:    ldr x9, [x8, #24]
+; ALL-NEXT:    ldr x8, [x8, #16]
+; ALL-NEXT:    lsl x14, x9, #1
+; ALL-NEXT:    asr x9, x9, x10
 ; ALL-NEXT:    lsl x15, x11, #1
 ; ALL-NEXT:    lsr x11, x11, x10
-; ALL-NEXT:    lsl x16, x12, #1
-; ALL-NEXT:    asr x8, x8, x10
-; ALL-NEXT:    lsr x12, x12, x10
-; ALL-NEXT:    lsl x14, x14, x13
-; ALL-NEXT:    lsr x9, x9, x10
-; ALL-NEXT:    lsl x10, x16, x13
-; ALL-NEXT:    lsl x13, x15, x13
-; ALL-NEXT:    orr x11, x14, x11
-; ALL-NEXT:    stp x11, x8, [x2, #16]
-; ALL-NEXT:    orr x8, x10, x9
+; ALL-NEXT:    lsr x13, x13, x10
+; ALL-NEXT:    lsl x14, x14, x12
+; ALL-NEXT:    lsl x12, x15, x12
+; ALL-NEXT:    lsl x15, x8, #1
+; ALL-NEXT:    lsr x8, x8, x10
+; ALL-NEXT:    mvn w10, w10
+; ALL-NEXT:    lsl x10, x15, x10
+; ALL-NEXT:    orr x8, x14, x8
+; ALL-NEXT:    stp x8, x9, [x2, #16]
 ; ALL-NEXT:    orr x9, x12, x13
-; ALL-NEXT:    stp x8, x9, [x2]
+; ALL-NEXT:    orr x8, x11, x10
+; ALL-NEXT:    stp x9, x8, [x2]
 ; ALL-NEXT:    add sp, sp, #64
 ; ALL-NEXT:    ret
   %src = load i256, ptr %src.ptr, align 1

diff --git a/llvm/test/CodeGen/Mips/llvm-ir/ashr.ll b/llvm/test/CodeGen/Mips/llvm-ir/ashr.ll
index 450fe968d4917c..2b8129acb91fce 100644
--- a/llvm/test/CodeGen/Mips/llvm-ir/ashr.ll
+++ b/llvm/test/CodeGen/Mips/llvm-ir/ashr.ll
@@ -382,53 +382,40 @@ define signext i128 @ashr_i128(i128 signext %a, i128 signext %b) {
 ; MIPS:       # %bb.0: # %entry
 ; MIPS-NEXT:    addiu $sp, $sp, -32
 ; MIPS-NEXT:    .cfi_def_cfa_offset 32
-; MIPS-NEXT:    swl $7, 28($sp)
-; MIPS-NEXT:    swl $6, 24($sp)
 ; MIPS-NEXT:    sra $1, $4, 31
-; MIPS-NEXT:    swl $5, 20($sp)
-; MIPS-NEXT:    swl $4, 16($sp)
-; MIPS-NEXT:    swl $1, 12($sp)
-; MIPS-NEXT:    swl $1, 8($sp)
-; MIPS-NEXT:    swl $1, 4($sp)
-; MIPS-NEXT:    swl $1, 0($sp)
-; MIPS-NEXT:    addiu $2, $sp, 0
-; MIPS-NEXT:    swr $7, 31($sp)
-; MIPS-NEXT:    swr $6, 27($sp)
-; MIPS-NEXT:    swr $5, 23($sp)
-; MIPS-NEXT:    swr $4, 19($sp)
-; MIPS-NEXT:    swr $1, 15($sp)
-; MIPS-NEXT:    swr $1, 11($sp)
-; MIPS-NEXT:    swr $1, 7($sp)
-; MIPS-NEXT:    swr $1, 3($sp)
-; MIPS-NEXT:    addiu $1, $2, 16
+; MIPS-NEXT:    sw $7, 28($sp)
+; MIPS-NEXT:    sw $6, 24($sp)
+; MIPS-NEXT:    sw $5, 20($sp)
+; MIPS-NEXT:    sw $4, 16($sp)
+; MIPS-NEXT:    sw $1, 12($sp)
+; MIPS-NEXT:    sw $1, 8($sp)
+; MIPS-NEXT:    sw $1, 4($sp)
+; MIPS-NEXT:    sw $1, 0($sp)
+; MIPS-NEXT:    addiu $1, $sp, 0
+; MIPS-NEXT:    addiu $1, $1, 16
 ; MIPS-NEXT:    lw $2, 60($sp)
 ; MIPS-NEXT:    srl $3, $2, 3
-; MIPS-NEXT:    andi $3, $3, 15
+; MIPS-NEXT:    andi $3, $3, 12
 ; MIPS-NEXT:    subu $1, $1, $3
-; MIPS-NEXT:    lwl $3, 4($1)
-; MIPS-NEXT:    lwr $3, 7($1)
-; MIPS-NEXT:    sll $4, $3, 1
-; MIPS-NEXT:    lwl $5, 8($1)
-; MIPS-NEXT:    lwr $5, 11($1)
-; MIPS-NEXT:    andi $2, $2, 7
-; MIPS-NEXT:    not $6, $2
-; MIPS-NEXT:    srlv $7, $5, $2
-; MIPS-NEXT:    sllv $4, $4, $6
+; MIPS-NEXT:    lw $3, 4($1)
+; MIPS-NEXT:    lw $5, 8($1)
+; MIPS-NEXT:    srlv $4, $5, $2
+; MIPS-NEXT:    sll $6, $3, 1
+; MIPS-NEXT:    andi $7, $2, 31
+; MIPS-NEXT:    xori $7, $7, 31
+; MIPS-NEXT:    sllv $6, $6, $7
 ; MIPS-NEXT:    srlv $3, $3, $2
-; MIPS-NEXT:    lwl $6, 0($1)
-; MIPS-NEXT:    lwr $6, 3($1)
-; MIPS-NEXT:    sll $8, $6, 1
-; MIPS-NEXT:    xori $9, $2, 31
-; MIPS-NEXT:    sllv $8, $8, $9
-; MIPS-NEXT:    or $3, $3, $8
-; MIPS-NEXT:    or $4, $7, $4
-; MIPS-NEXT:    lwl $7, 12($1)
-; MIPS-NEXT:    lwr $7, 15($1)
-; MIPS-NEXT:    srlv $1, $7, $2
+; MIPS-NEXT:    lw $8, 0($1)
+; MIPS-NEXT:    sll $9, $8, 1
+; MIPS-NEXT:    sllv $9, $9, $7
+; MIPS-NEXT:    or $3, $3, $9
+; MIPS-NEXT:    or $4, $4, $6
+; MIPS-NEXT:    lw $1, 12($1)
+; MIPS-NEXT:    srlv $1, $1, $2
 ; MIPS-NEXT:    sll $5, $5, 1
-; MIPS-NEXT:    sllv $5, $5, $9
+; MIPS-NEXT:    sllv $5, $5, $7
 ; MIPS-NEXT:    or $5, $1, $5
-; MIPS-NEXT:    srav $2, $6, $2
+; MIPS-NEXT:    srav $2, $8, $2
 ; MIPS-NEXT:    jr $ra
 ; MIPS-NEXT:    addiu $sp, $sp, 32
 ;
@@ -436,53 +423,40 @@ define signext i128 @ashr_i128(i128 signext %a, i128 signext %b) {
 ; MIPS32:       # %bb.0: # %entry
 ; MIPS32-NEXT:    addiu $sp, $sp, -32
 ; MIPS32-NEXT:    .cfi_def_cfa_offset 32
-; MIPS32-NEXT:    swl $7, 28($sp)
-; MIPS32-NEXT:    swl $6, 24($sp)
 ; MIPS32-NEXT:    sra $1, $4, 31
-; MIPS32-NEXT:    swl $5, 20($sp)
-; MIPS32-NEXT:    swl $4, 16($sp)
-; MIPS32-NEXT:    swl $1, 12($sp)
-; MIPS32-NEXT:    swl $1, 8($sp)
-; MIPS32-NEXT:    swl $1, 4($sp)
-; MIPS32-NEXT:    swl $1, 0($sp)
-; MIPS32-NEXT:    addiu $2, $sp, 0
-; MIPS32-NEXT:    swr $7, 31($sp)
-; MIPS32-NEXT:    swr $6, 27($sp)
-; MIPS32-NEXT:    swr $5, 23($sp)
-; MIPS32-NEXT:    swr $4, 19($sp)
-; MIPS32-NEXT:    swr $1, 15($sp)
-; MIPS32-NEXT:    swr $1, 11($sp)
-; MIPS32-NEXT:    swr $1, 7($sp)
-; MIPS32-NEXT:    swr $1, 3($sp)
-; MIPS32-NEXT:    addiu $1, $2, 16
+; MIPS32-NEXT:    sw $7, 28($sp)
+; MIPS32-NEXT:    sw $6, 24($sp)
+; MIPS32-NEXT:    sw $5, 20($sp)
+; MIPS32-NEXT:    sw $4, 16($sp)
+; MIPS32-NEXT:    sw $1, 12($sp)
+; MIPS32-NEXT:    sw $1, 8($sp)
+; MIPS32-NEXT:    sw $1, 4($sp)
+; MIPS32-NEXT:    sw $1, 0($sp)
+; MIPS32-NEXT:    addiu $1, $sp, 0
+; MIPS32-NEXT:    addiu $1, $1, 16
 ; MIPS32-NEXT:    lw $2, 60($sp)
 ; MIPS32-NEXT:    srl $3, $2, 3
-; MIPS32-NEXT:    andi $3, $3, 15
+; MIPS32-NEXT:    andi $3, $3, 12
 ; MIPS32-NEXT:    subu $1, $1, $3
-; MIPS32-NEXT:    lwl $3, 4($1)
-; MIPS32-NEXT:    lwr $3, 7($1)
-; MIPS32-NEXT:    sll $4, $3, 1
-; MIPS32-NEXT:    lwl $5, 8($1)
-; MIPS32-NEXT:    lwr $5, 11($1)
-; MIPS32-NEXT:    andi $2, $2, 7
-; MIPS32-NEXT:    not $6, $2
-; MIPS32-NEXT:    srlv $7, $5, $2
-; MIPS32-NEXT:    sllv $4, $4, $6
+; MIPS32-NEXT:    lw $3, 4($1)
+; MIPS32-NEXT:    lw $5, 8($1)
+; MIPS32-NEXT:    srlv $4, $5, $2
+; MIPS32-NEXT:    sll $6, $3, 1
+; MIPS32-NEXT:    andi $7, $2, 31
+; MIPS32-NEXT:    xori $7, $7, 31
+; MIPS32-NEXT:    sllv $6, $6, $7
 ; MIPS32-NEXT:    srlv $3, $3, $2
-; MIPS32-NEXT:    lwl $6, 0($1)
-; MIPS32-NEXT:    lwr $6, 3($1)
-; MIPS32-NEXT:    sll $8, $6, 1
-; MIPS32-NEXT:    xori $9, $2, 31
-; MIPS32-NEXT:    sllv $8, $8, $9
-; MIPS32-NEXT:    or $3, $3, $8
-; MIPS32-NEXT:    or $4, $7, $4
-; MIPS32-NEXT:    lwl $7, 12($1)
-; MIPS32-NEXT:    lwr $7, 15($1)
-; MIPS32-NEXT:    srlv $1, $7, $2
+; MIPS32-NEXT:    lw $8, 0($1)
+; MIPS32-NEXT:    sll $9, $8, 1
+; MIPS32-NEXT:    sllv $9, $9, $7
+; MIPS32-NEXT:    or $3, $3, $9
+; MIPS32-NEXT:    or $4, $4, $6
+; MIPS32-NEXT:    lw $1, 12($1)
+; MIPS32-NEXT:    srlv $1, $1, $2
 ; MIPS32-NEXT:    sll $5, $5, 1
-; MIPS32-NEXT:    sllv $5, $5, $9
+; MIPS32-NEXT:    sllv $5, $5, $7
 ; MIPS32-NEXT:    or $5, $1, $5
-; MIPS32-NEXT:    srav $2, $6, $2
+; MIPS32-NEXT:    srav $2, $8, $2
 ; MIPS32-NEXT:    jr $ra
 ; MIPS32-NEXT:    addiu $sp, $sp, 32
 ;
@@ -490,52 +464,40 @@ define signext i128 @ashr_i128(i128 signext %a, i128 signext %b) {
 ; 32R2:       # %bb.0: # %entry
 ; 32R2-NEXT:    addiu $sp, $sp, -32
 ; 32R2-NEXT:    .cfi_def_cfa_offset 32
-; 32R2-NEXT:    swl $7, 28($sp)
-; 32R2-NEXT:    swl $6, 24($sp)
-; 32R2-NEXT:    swl $5, 20($sp)
 ; 32R2-NEXT:    sra $1, $4, 31
-; 32R2-NEXT:    swl $4, 16($sp)
-; 32R2-NEXT:    swl $1, 12($sp)
-; 32R2-NEXT:    swl $1, 8($sp)
-; 32R2-NEXT:    swl $1, 4($sp)
-; 32R2-NEXT:    swl $1, 0($sp)
-; 32R2-NEXT:    swr $7, 31($sp)
-; 32R2-NEXT:    swr $6, 27($sp)
-; 32R2-NEXT:    swr $5, 23($sp)
-; 32R2-NEXT:    swr $4, 19($sp)
-; 32R2-NEXT:    swr $1, 15($sp)
-; 32R2-NEXT:    swr $1, 11($sp)
-; 32R2-NEXT:    swr $1, 7($sp)
-; 32R2-NEXT:    swr $1, 3($sp)
+; 32R2-NEXT:    sw $7, 28($sp)
+; 32R2-NEXT:    sw $6, 24($sp)
+; 32R2-NEXT:    sw $5, 20($sp)
+; 32R2-NEXT:    sw $4, 16($sp)
+; 32R2-NEXT:    sw $1, 12($sp)
+; 32R2-NEXT:    sw $1, 8($sp)
+; 32R2-NEXT:    sw $1, 4($sp)
+; 32R2-NEXT:    sw $1, 0($sp)
 ; 32R2-NEXT:    addiu $1, $sp, 0
 ; 32R2-NEXT:    addiu $1, $1, 16
 ; 32R2-NEXT:    lw $2, 60($sp)
-; 32R2-NEXT:    ext $3, $2, 3, 4
+; 32R2-NEXT:    srl $3, $2, 3
+; 32R2-NEXT:    andi $3, $3, 12
 ; 32R2-NEXT:    subu $1, $1, $3
-; 32R2-NEXT:    lwl $3, 4($1)
-; 32R2-NEXT:    lwr $3, 7($1)
-; 32R2-NEXT:    sll $4, $3, 1
-; 32R2-NEXT:    lwl $5, 8($1)
-; 32R2-NEXT:    lwr $5, 11($1)
-; 32R2-NEXT:    andi $2, $2, 7
-; 32R2-NEXT:    not $6, $2
-; 32R2-NEXT:    srlv $7, $5, $2
-; 32R2-NEXT:    sllv $4, $4, $6
+; 32R2-NEXT:    lw $3, 4($1)
+; 32R2-NEXT:    lw $5, 8($1)
+; 32R2-NEXT:    srlv $4, $5, $2
+; 32R2-NEXT:    sll $6, $3, 1
+; 32R2-NEXT:    andi $7, $2, 31
+; 32R2-NEXT:    xori $7, $7, 31
+; 32R2-NEXT:    sllv $6, $6, $7
 ; 32R2-NEXT:    srlv $3, $3, $2
-; 32R2-NEXT:    lwl $6, 0($1)
-; 32R2-NEXT:    lwr $6, 3($1)
-; 32R2-NEXT:    sll $8, $6, 1
-; 32R2-NEXT:    xori $9, $2, 31
-; 32R2-NEXT:    sllv $8, $8, $9
-; 32R2-NEXT:    or $3, $3, $8
-; 32R2-NEXT:    or $4, $7, $4
-; 32R2-NEXT:    lwl $7, 12($1)
-; 32R2-NEXT:    lwr $7, 15($1)
-; 32R2-NEXT:    srlv $1, $7, $2
+; 32R2-NEXT:    lw $8, 0($1)
+; 32R2-NEXT:    sll $9, $8, 1
+; 32R2-NEXT:    sllv $9, $9, $7
+; 32R2-NEXT:    or $3, $3, $9
+; 32R2-NEXT:    or $4, $4, $6
+; 32R2-NEXT:    lw $1, 12($1)
+; 32R2-NEXT:    srlv $1, $1, $2
 ; 32R2-NEXT:    sll $5, $5, 1
-; 32R2-NEXT:    sllv $5, $5, $9
+; 32R2-NEXT:    sllv $5, $5, $7
 ; 32R2-NEXT:    or $5, $1, $5
-; 32R2-NEXT:    srav $2, $6, $2
+; 32R2-NEXT:    srav $2, $8, $2
 ; 32R2-NEXT:    jr $ra
 ; 32R2-NEXT:    addiu $sp, $sp, 32
 ;
@@ -555,28 +517,28 @@ define signext i128 @ashr_i128(i128 signext %a, i128 signext %b) {
 ; 32R6-NEXT:    addiu $1, $sp, 0
 ; 32R6-NEXT:    addiu $1, $1, 16
 ; 32R6-NEXT:    lw $2, 60($sp)
-; 32R6-NEXT:    ext $3, $2, 3, 4
+; 32R6-NEXT:    srl $3, $2, 3
+; 32R6-NEXT:    andi $3, $3, 12
 ; 32R6-NEXT:    subu $1, $1, $3
 ; 32R6-NEXT:    lw $3, 4($1)
-; 32R6-NEXT:    sll $4, $3, 1
 ; 32R6-NEXT:    lw $5, 8($1)
-; 32R6-NEXT:    andi $2, $2, 7
-; 32R6-NEXT:    not $6, $2
-; 32R6-NEXT:    srlv $7, $5, $2
-; 32R6-NEXT:    sllv $4, $4, $6
+; 32R6-NEXT:    srlv $4, $5, $2
+; 32R6-NEXT:    sll $6, $3, 1
+; 32R6-NEXT:    andi $7, $2, 31
+; 32R6-NEXT:    xori $7, $7, 31
+; 32R6-NEXT:    sllv $6, $6, $7
 ; 32R6-NEXT:    srlv $3, $3, $2
-; 32R6-NEXT:    lw $6, 0($1)
-; 32R6-NEXT:    sll $8, $6, 1
-; 32R6-NEXT:    xori $9, $2, 31
-; 32R6-NEXT:    sllv $8, $8, $9
-; 32R6-NEXT:    or $3, $3, $8
-; 32R6-NEXT:    or $4, $7, $4
+; 32R6-NEXT:    lw $8, 0($1)
+; 32R6-NEXT:    sll $9, $8, 1
+; 32R6-NEXT:    sllv $9, $9, $7
+; 32R6-NEXT:    or $3, $3, $9
+; 32R6-NEXT:    or $4, $4, $6
 ; 32R6-NEXT:    lw $1, 12($1)
 ; 32R6-NEXT:    srlv $1, $1, $2
 ; 32R6-NEXT:    sll $5, $5, 1
-; 32R6-NEXT:    sllv $5, $5, $9
+; 32R6-NEXT:    sllv $5, $5, $7
 ; 32R6-NEXT:    or $5, $1, $5
-; 32R6-NEXT:    srav $2, $6, $2
+; 32R6-NEXT:    srav $2, $8, $2
 ; 32R6-NEXT:    jr $ra
 ; 32R6-NEXT:    addiu $sp, $sp, 32
 ;
@@ -656,53 +618,37 @@ define signext i128 @ashr_i128(i128 signext %a, i128 signext %b) {
 ; MMR3-NEXT:    swp $16, 32($sp)
 ; MMR3-NEXT:    .cfi_offset 17, -4
 ; MMR3-NEXT:    .cfi_offset 16, -8
-; MMR3-NEXT:    swl $7, 28($sp)
-; MMR3-NEXT:    swl $6, 24($sp)
-; MMR3-NEXT:    swl $5, 20($sp)
 ; MMR3-NEXT:    sra $1, $4, 31
-; MMR3-NEXT:    swl $4, 16($sp)
-; MMR3-NEXT:    swl $1, 12($sp)
-; MMR3-NEXT:    swl $1, 8($sp)
-; MMR3-NEXT:    swl $1, 4($sp)
-; MMR3-NEXT:    swl $1, 0($sp)
-; MMR3-NEXT:    swr $7, 31($sp)
-; MMR3-NEXT:    swr $6, 27($sp)
-; MMR3-NEXT:    swr $5, 23($sp)
-; MMR3-NEXT:    swr $4, 19($sp)
-; MMR3-NEXT:    swr $1, 15($sp)
-; MMR3-NEXT:    swr $1, 11($sp)
-; MMR3-NEXT:    swr $1, 7($sp)
-; MMR3-NEXT:    swr $1, 3($sp)
+; MMR3-NEXT:    swp $6, 24($sp)
+; MMR3-NEXT:    swp $4, 16($sp)
+; MMR3-NEXT:    sw $1, 12($sp)
+; MMR3-NEXT:    sw $1, 8($sp)
+; MMR3-NEXT:    sw $1, 4($sp)
+; MMR3-NEXT:    sw $1, 0($sp)
 ; MMR3-NEXT:    addiur1sp $2, 0
 ; MMR3-NEXT:    addiur2 $2, $2, 16
 ; MMR3-NEXT:    lw $3, 68($sp)
-; MMR3-NEXT:    ext $4, $3, 3, 4
-; MMR3-NEXT:    subu16 $2, $2, $4
-; MMR3-NEXT:    lwl $7, 4($2)
-; MMR3-NEXT:    lwr $7, 7($2)
-; MMR3-NEXT:    sll16 $4, $7, 1
-; MMR3-NEXT:    lwl $5, 8($2)
-; MMR3-NEXT:    lwr $5, 11($2)
-; MMR3-NEXT:    andi16 $6, $3, 7
-; MMR3-NEXT:    not16 $3, $6
-; MMR3-NEXT:    andi16 $3, $3, 31
-; MMR3-NEXT:    srlv $16, $5, $6
-; MMR3-NEXT:    sllv $4, $4, $3
-; MMR3-NEXT:    srlv $17, $7, $6
-; MMR3-NEXT:    lwl $7, 0($2)
-; MMR3-NEXT:    lwr $7, 3($2)
-; MMR3-NEXT:    sll16 $3, $7, 1
-; MMR3-NEXT:    xori $1, $6, 31
+; MMR3-NEXT:    srl16 $4, $3, 3
+; MMR3-NEXT:    andi $4, $4, 12
+; MMR3-NEXT:    subu16 $5, $2, $4
+; MMR3-NEXT:    lwp $6, 4($5)
+; MMR3-NEXT:    andi16 $2, $3, 31
+; MMR3-NEXT:    srlv $16, $7, $2
+; MMR3-NEXT:    sll16 $3, $6, 1
+; MMR3-NEXT:    xori $1, $2, 31
+; MMR3-NEXT:    sllv $4, $3, $1
+; MMR3-NEXT:    srlv $6, $6, $2
+; MMR3-NEXT:    lw16 $17, 0($5)
+; MMR3-NEXT:    sll16 $3, $17, 1
 ; MMR3-NEXT:    sllv $3, $3, $1
-; MMR3-NEXT:    or16 $3, $17
+; MMR3-NEXT:    or16 $3, $6
 ; MMR3-NEXT:    or16 $4, $16
-; MMR3-NEXT:    lwl $8, 12($2)
-; MMR3-NEXT:    lwr $8, 15($2)
-; MMR3-NEXT:    srlv $2, $8, $6
-; MMR3-NEXT:    sll16 $5, $5, 1
+; MMR3-NEXT:    lw16 $5, 12($5)
+; MMR3-NEXT:    srlv $6, $5, $2
+; MMR3-NEXT:    sll16 $5, $7, 1
 ; MMR3-NEXT:    sllv $5, $5, $1
-; MMR3-NEXT:    or16 $5, $2
-; MMR3-NEXT:    srav $2, $7, $6
+; MMR3-NEXT:    or16 $5, $6
+; MMR3-NEXT:    srav $2, $17, $2
 ; MMR3-NEXT:    lwp $16, 32($sp)
 ; MMR3-NEXT:    addiusp 40
 ; MMR3-NEXT:    jrc $ra
@@ -714,40 +660,39 @@ define signext i128 @ashr_i128(i128 signext %a, i128 signext %b) {
 ; MMR6-NEXT:    sw $16, 36($sp) # 4-byte Folded Spill
 ; MMR6-NEXT:    .cfi_offset 16, -4
 ; MMR6-NEXT:    sra $1, $4, 31
-; MMR6-NEXT:    sw $7, 32($sp)
-; MMR6-NEXT:    sw $6, 28($sp)
-; MMR6-NEXT:    sw $5, 24($sp)
-; MMR6-NEXT:    sw $4, 20($sp)
-; MMR6-NEXT:    sw $1, 16($sp)
+; MMR6-NEXT:    sw $7, 28($sp)
+; MMR6-NEXT:    sw $6, 24($sp)
+; MMR6-NEXT:    sw $5, 20($sp)
+; MMR6-NEXT:    sw $4, 16($sp)
 ; MMR6-NEXT:    sw $1, 12($sp)
 ; MMR6-NEXT:    sw $1, 8($sp)
 ; MMR6-NEXT:    sw $1, 4($sp)
-; MMR6-NEXT:    addiu $2, $sp, 4
+; MMR6-NEXT:    sw $1, 0($sp)
+; MMR6-NEXT:    addiu $2, $sp, 0
 ; MMR6-NEXT:    addiur2 $2, $2, 16
 ; MMR6-NEXT:    lw $3, 68($sp)
-; MMR6-NEXT:    ext $4, $3, 3, 4
-; MMR6-NEXT:    subu16 $5, $2, $4
-; MMR6-NEXT:    lw16 $4, 4($5)
-; MMR6-NEXT:    sll16 $6, $4, 1
-; MMR6-NEXT:    lw16 $7, 8($5)
-; MMR6-NEXT:    andi16 $2, $3, 7
-; MMR6-NEXT:    not16 $3, $2
-; MMR6-NEXT:    andi16 $3, $3, 31
-; MMR6-NEXT:    srlv $1, $7, $2
-; MMR6-NEXT:    sllv $6, $6, $3
-; MMR6-NEXT:    srlv $3, $4, $2
-; MMR6-NEXT:    lw16 $16, 0($5)
+; MMR6-NEXT:    srl16 $4, $3, 3
+; MMR6-NEXT:    andi $4, $4, 12
+; MMR6-NEXT:    subu16 $2, $2, $4
+; MMR6-NEXT:    lw16 $4, 4($2)
+; MMR6-NEXT:    lw16 $5, 8($2)
+; MMR6-NEXT:    andi16 $6, $3, 31
+; MMR6-NEXT:    srlv $1, $5, $6
+; MMR6-NEXT:    sll16 $3, $4, 1
+; MMR6-NEXT:    xori $7, $6, 31
+; MMR6-NEXT:    sllv $8, $3, $7
+; MMR6-NEXT:    srlv $3, $4, $6
+; MMR6-NEXT:    lw16 $16, 0($2)
 ; MMR6-NEXT:    sll16 $4, $16, 1
-; MMR6-NEXT:    xori $8, $2, 31
-; MMR6-NEXT:    sllv $4, $4, $8
+; MMR6-NEXT:    sllv $4, $4, $7
 ; MMR6-NEXT:    or $3, $3, $4
-; MMR6-NEXT:    or $4, $1, $6
-; MMR6-NEXT:    lw16 $5, 12($5)
-; MMR6-NEXT:    srlv $1, $5, $2
-; MMR6-NEXT:    sll16 $5, $7, 1
-; MMR6-NEXT:    sllv $5, $5, $8
-; MMR6-NEXT:    or $5, $1, $5
-; MMR6-NEXT:    srav $2, $16, $2
+; MMR6-NEXT:    or $4, $1, $8
+; MMR6-NEXT:    lw16 $2, 12($2)
+; MMR6-NEXT:    srlv $1, $2, $6
+; MMR6-NEXT:    sll16 $2, $5, 1
+; MMR6-NEXT:    sllv $2, $2, $7
+; MMR6-NEXT:    or $5, $1, $2
+; MMR6-NEXT:    srav $2, $16, $6
 ; MMR6-NEXT:    lw $16, 36($sp) # 4-byte Folded Reload
 ; MMR6-NEXT:    addiu $sp, $sp, 40
 ; MMR6-NEXT:    jrc $ra

diff --git a/llvm/test/CodeGen/Mips/llvm-ir/lshr.ll b/llvm/test/CodeGen/Mips/llvm-ir/lshr.ll
index 03cf104e3120c4..69b842c73db1b4 100644
--- a/llvm/test/CodeGen/Mips/llvm-ir/lshr.ll
+++ b/llvm/test/CodeGen/Mips/llvm-ir/lshr.ll
@@ -398,52 +398,39 @@ define signext i128 @lshr_i128(i128 signext %a, i128 signext %b) {
 ; MIPS2:       # %bb.0: # %entry
 ; MIPS2-NEXT:    addiu $sp, $sp, -32
 ; MIPS2-NEXT:    .cfi_def_cfa_offset 32
-; MIPS2-NEXT:    swl $7, 28($sp)
-; MIPS2-NEXT:    swl $6, 24($sp)
-; MIPS2-NEXT:    swl $5, 20($sp)
-; MIPS2-NEXT:    swl $4, 16($sp)
-; MIPS2-NEXT:    swl $zero, 12($sp)
-; MIPS2-NEXT:    swl $zero, 8($sp)
-; MIPS2-NEXT:    swl $zero, 4($sp)
-; MIPS2-NEXT:    swl $zero, 0($sp)
 ; MIPS2-NEXT:    addiu $1, $sp, 0
-; MIPS2-NEXT:    swr $7, 31($sp)
-; MIPS2-NEXT:    swr $6, 27($sp)
-; MIPS2-NEXT:    swr $5, 23($sp)
-; MIPS2-NEXT:    swr $4, 19($sp)
-; MIPS2-NEXT:    swr $zero, 15($sp)
-; MIPS2-NEXT:    swr $zero, 11($sp)
-; MIPS2-NEXT:    swr $zero, 7($sp)
-; MIPS2-NEXT:    swr $zero, 3($sp)
+; MIPS2-NEXT:    sw $7, 28($sp)
+; MIPS2-NEXT:    sw $6, 24($sp)
+; MIPS2-NEXT:    sw $5, 20($sp)
+; MIPS2-NEXT:    sw $4, 16($sp)
 ; MIPS2-NEXT:    addiu $1, $1, 16
 ; MIPS2-NEXT:    lw $2, 60($sp)
 ; MIPS2-NEXT:    srl $3, $2, 3
-; MIPS2-NEXT:    andi $3, $3, 15
+; MIPS2-NEXT:    andi $3, $3, 12
 ; MIPS2-NEXT:    subu $1, $1, $3
-; MIPS2-NEXT:    lwl $3, 4($1)
-; MIPS2-NEXT:    lwr $3, 7($1)
-; MIPS2-NEXT:    sll $4, $3, 1
-; MIPS2-NEXT:    lwl $5, 8($1)
-; MIPS2-NEXT:    lwr $5, 11($1)
-; MIPS2-NEXT:    andi $2, $2, 7
-; MIPS2-NEXT:    not $6, $2
-; MIPS2-NEXT:    srlv $7, $5, $2
-; MIPS2-NEXT:    sllv $4, $4, $6
+; MIPS2-NEXT:    sw $zero, 12($sp)
+; MIPS2-NEXT:    sw $zero, 8($sp)
+; MIPS2-NEXT:    sw $zero, 4($sp)
+; MIPS2-NEXT:    sw $zero, 0($sp)
+; MIPS2-NEXT:    lw $3, 4($1)
+; MIPS2-NEXT:    lw $5, 8($1)
+; MIPS2-NEXT:    srlv $4, $5, $2
+; MIPS2-NEXT:    sll $6, $3, 1
+; MIPS2-NEXT:    andi $7, $2, 31
+; MIPS2-NEXT:    xori $7, $7, 31
+; MIPS2-NEXT:    sllv $6, $6, $7
 ; MIPS2-NEXT:    srlv $3, $3, $2
-; MIPS2-NEXT:    lwl $6, 0($1)
-; MIPS2-NEXT:    lwr $6, 3($1)
-; MIPS2-NEXT:    sll $8, $6, 1
-; MIPS2-NEXT:    xori $9, $2, 31
-; MIPS2-NEXT:    sllv $8, $8, $9
-; MIPS2-NEXT:    or $3, $3, $8
-; MIPS2-NEXT:    or $4, $7, $4
-; MIPS2-NEXT:    lwl $7, 12($1)
-; MIPS2-NEXT:    lwr $7, 15($1)
-; MIPS2-NEXT:    srlv $1, $7, $2
+; MIPS2-NEXT:    lw $8, 0($1)
+; MIPS2-NEXT:    sll $9, $8, 1
+; MIPS2-NEXT:    sllv $9, $9, $7
+; MIPS2-NEXT:    or $3, $3, $9
+; MIPS2-NEXT:    or $4, $4, $6
+; MIPS2-NEXT:    lw $1, 12($1)
+; MIPS2-NEXT:    srlv $1, $1, $2
 ; MIPS2-NEXT:    sll $5, $5, 1
-; MIPS2-NEXT:    sllv $5, $5, $9
+; MIPS2-NEXT:    sllv $5, $5, $7
 ; MIPS2-NEXT:    or $5, $1, $5
-; MIPS2-NEXT:    srlv $2, $6, $2
+; MIPS2-NEXT:    srlv $2, $8, $2
 ; MIPS2-NEXT:    jr $ra
 ; MIPS2-NEXT:    addiu $sp, $sp, 32
 ;
@@ -451,52 +438,39 @@ define signext i128 @lshr_i128(i128 signext %a, i128 signext %b) {
 ; MIPS32:       # %bb.0: # %entry
 ; MIPS32-NEXT:    addiu $sp, $sp, -32
 ; MIPS32-NEXT:    .cfi_def_cfa_offset 32
-; MIPS32-NEXT:    swl $7, 28($sp)
-; MIPS32-NEXT:    swl $6, 24($sp)
-; MIPS32-NEXT:    swl $5, 20($sp)
-; MIPS32-NEXT:    swl $4, 16($sp)
-; MIPS32-NEXT:    swl $zero, 12($sp)
-; MIPS32-NEXT:    swl $zero, 8($sp)
-; MIPS32-NEXT:    swl $zero, 4($sp)
-; MIPS32-NEXT:    swl $zero, 0($sp)
 ; MIPS32-NEXT:    addiu $1, $sp, 0
-; MIPS32-NEXT:    swr $7, 31($sp)
-; MIPS32-NEXT:    swr $6, 27($sp)
-; MIPS32-NEXT:    swr $5, 23($sp)
-; MIPS32-NEXT:    swr $4, 19($sp)
-; MIPS32-NEXT:    swr $zero, 15($sp)
-; MIPS32-NEXT:    swr $zero, 11($sp)
-; MIPS32-NEXT:    swr $zero, 7($sp)
-; MIPS32-NEXT:    swr $zero, 3($sp)
+; MIPS32-NEXT:    sw $7, 28($sp)
+; MIPS32-NEXT:    sw $6, 24($sp)
+; MIPS32-NEXT:    sw $5, 20($sp)
+; MIPS32-NEXT:    sw $4, 16($sp)
 ; MIPS32-NEXT:    addiu $1, $1, 16
 ; MIPS32-NEXT:    lw $2, 60($sp)
 ; MIPS32-NEXT:    srl $3, $2, 3
-; MIPS32-NEXT:    andi $3, $3, 15
+; MIPS32-NEXT:    andi $3, $3, 12
 ; MIPS32-NEXT:    subu $1, $1, $3
-; MIPS32-NEXT:    lwl $3, 4($1)
-; MIPS32-NEXT:    lwr $3, 7($1)
-; MIPS32-NEXT:    sll $4, $3, 1
-; MIPS32-NEXT:    lwl $5, 8($1)
-; MIPS32-NEXT:    lwr $5, 11($1)
-; MIPS32-NEXT:    andi $2, $2, 7
-; MIPS32-NEXT:    not $6, $2
-; MIPS32-NEXT:    srlv $7, $5, $2
-; MIPS32-NEXT:    sllv $4, $4, $6
+; MIPS32-NEXT:    sw $zero, 12($sp)
+; MIPS32-NEXT:    sw $zero, 8($sp)
+; MIPS32-NEXT:    sw $zero, 4($sp)
+; MIPS32-NEXT:    sw $zero, 0($sp)
+; MIPS32-NEXT:    lw $3, 4($1)
+; MIPS32-NEXT:    lw $5, 8($1)
+; MIPS32-NEXT:    srlv $4, $5, $2
+; MIPS32-NEXT:    sll $6, $3, 1
+; MIPS32-NEXT:    andi $7, $2, 31
+; MIPS32-NEXT:    xori $7, $7, 31
+; MIPS32-NEXT:    sllv $6, $6, $7
 ; MIPS32-NEXT:    srlv $3, $3, $2
-; MIPS32-NEXT:    lwl $6, 0($1)
-; MIPS32-NEXT:    lwr $6, 3($1)
-; MIPS32-NEXT:    sll $8, $6, 1
-; MIPS32-NEXT:    xori $9, $2, 31
-; MIPS32-NEXT:    sllv $8, $8, $9
-; MIPS32-NEXT:    or $3, $3, $8
-; MIPS32-NEXT:    or $4, $7, $4
-; MIPS32-NEXT:    lwl $7, 12($1)
-; MIPS32-NEXT:    lwr $7, 15($1)
-; MIPS32-NEXT:    srlv $1, $7, $2
+; MIPS32-NEXT:    lw $8, 0($1)
+; MIPS32-NEXT:    sll $9, $8, 1
+; MIPS32-NEXT:    sllv $9, $9, $7
+; MIPS32-NEXT:    or $3, $3, $9
+; MIPS32-NEXT:    or $4, $4, $6
+; MIPS32-NEXT:    lw $1, 12($1)
+; MIPS32-NEXT:    srlv $1, $1, $2
 ; MIPS32-NEXT:    sll $5, $5, 1
-; MIPS32-NEXT:    sllv $5, $5, $9
+; MIPS32-NEXT:    sllv $5, $5, $7
 ; MIPS32-NEXT:    or $5, $1, $5
-; MIPS32-NEXT:    srlv $2, $6, $2
+; MIPS32-NEXT:    srlv $2, $8, $2
 ; MIPS32-NEXT:    jr $ra
 ; MIPS32-NEXT:    addiu $sp, $sp, 32
 ;
@@ -504,51 +478,39 @@ define signext i128 @lshr_i128(i128 signext %a, i128 signext %b) {
 ; MIPS32R2:       # %bb.0: # %entry
 ; MIPS32R2-NEXT:    addiu $sp, $sp, -32
 ; MIPS32R2-NEXT:    .cfi_def_cfa_offset 32
-; MIPS32R2-NEXT:    swl $7, 28($sp)
-; MIPS32R2-NEXT:    swl $6, 24($sp)
-; MIPS32R2-NEXT:    swl $5, 20($sp)
-; MIPS32R2-NEXT:    swl $4, 16($sp)
-; MIPS32R2-NEXT:    swl $zero, 12($sp)
-; MIPS32R2-NEXT:    swl $zero, 8($sp)
-; MIPS32R2-NEXT:    swl $zero, 4($sp)
-; MIPS32R2-NEXT:    swl $zero, 0($sp)
-; MIPS32R2-NEXT:    swr $7, 31($sp)
-; MIPS32R2-NEXT:    swr $6, 27($sp)
-; MIPS32R2-NEXT:    swr $5, 23($sp)
-; MIPS32R2-NEXT:    swr $4, 19($sp)
-; MIPS32R2-NEXT:    swr $zero, 15($sp)
-; MIPS32R2-NEXT:    swr $zero, 11($sp)
-; MIPS32R2-NEXT:    swr $zero, 7($sp)
-; MIPS32R2-NEXT:    swr $zero, 3($sp)
 ; MIPS32R2-NEXT:    addiu $1, $sp, 0
+; MIPS32R2-NEXT:    sw $7, 28($sp)
+; MIPS32R2-NEXT:    sw $6, 24($sp)
+; MIPS32R2-NEXT:    sw $5, 20($sp)
+; MIPS32R2-NEXT:    sw $4, 16($sp)
 ; MIPS32R2-NEXT:    addiu $1, $1, 16
 ; MIPS32R2-NEXT:    lw $2, 60($sp)
-; MIPS32R2-NEXT:    ext $3, $2, 3, 4
+; MIPS32R2-NEXT:    srl $3, $2, 3
+; MIPS32R2-NEXT:    andi $3, $3, 12
 ; MIPS32R2-NEXT:    subu $1, $1, $3
-; MIPS32R2-NEXT:    lwl $3, 4($1)
-; MIPS32R2-NEXT:    lwr $3, 7($1)
-; MIPS32R2-NEXT:    sll $4, $3, 1
-; MIPS32R2-NEXT:    lwl $5, 8($1)
-; MIPS32R2-NEXT:    lwr $5, 11($1)
-; MIPS32R2-NEXT:    andi $2, $2, 7
-; MIPS32R2-NEXT:    not $6, $2
-; MIPS32R2-NEXT:    srlv $7, $5, $2
-; MIPS32R2-NEXT:    sllv $4, $4, $6
+; MIPS32R2-NEXT:    sw $zero, 12($sp)
+; MIPS32R2-NEXT:    sw $zero, 8($sp)
+; MIPS32R2-NEXT:    sw $zero, 4($sp)
+; MIPS32R2-NEXT:    sw $zero, 0($sp)
+; MIPS32R2-NEXT:    lw $3, 4($1)
+; MIPS32R2-NEXT:    lw $5, 8($1)
+; MIPS32R2-NEXT:    srlv $4, $5, $2
+; MIPS32R2-NEXT:    sll $6, $3, 1
+; MIPS32R2-NEXT:    andi $7, $2, 31
+; MIPS32R2-NEXT:    xori $7, $7, 31
+; MIPS32R2-NEXT:    sllv $6, $6, $7
 ; MIPS32R2-NEXT:    srlv $3, $3, $2
-; MIPS32R2-NEXT:    lwl $6, 0($1)
-; MIPS32R2-NEXT:    lwr $6, 3($1)
-; MIPS32R2-NEXT:    sll $8, $6, 1
-; MIPS32R2-NEXT:    xori $9, $2, 31
-; MIPS32R2-NEXT:    sllv $8, $8, $9
-; MIPS32R2-NEXT:    or $3, $3, $8
-; MIPS32R2-NEXT:    or $4, $7, $4
-; MIPS32R2-NEXT:    lwl $7, 12($1)
-; MIPS32R2-NEXT:    lwr $7, 15($1)
-; MIPS32R2-NEXT:    srlv $1, $7, $2
+; MIPS32R2-NEXT:    lw $8, 0($1)
+; MIPS32R2-NEXT:    sll $9, $8, 1
+; MIPS32R2-NEXT:    sllv $9, $9, $7
+; MIPS32R2-NEXT:    or $3, $3, $9
+; MIPS32R2-NEXT:    or $4, $4, $6
+; MIPS32R2-NEXT:    lw $1, 12($1)
+; MIPS32R2-NEXT:    srlv $1, $1, $2
 ; MIPS32R2-NEXT:    sll $5, $5, 1
-; MIPS32R2-NEXT:    sllv $5, $5, $9
+; MIPS32R2-NEXT:    sllv $5, $5, $7
 ; MIPS32R2-NEXT:    or $5, $1, $5
-; MIPS32R2-NEXT:    srlv $2, $6, $2
+; MIPS32R2-NEXT:    srlv $2, $8, $2
 ; MIPS32R2-NEXT:    jr $ra
 ; MIPS32R2-NEXT:    addiu $sp, $sp, 32
 ;
@@ -563,32 +525,32 @@ define signext i128 @lshr_i128(i128 signext %a, i128 signext %b) {
 ; MIPS32R6-NEXT:    sw $4, 16($sp)
 ; MIPS32R6-NEXT:    addiu $1, $1, 16
 ; MIPS32R6-NEXT:    lw $2, 60($sp)
-; MIPS32R6-NEXT:    ext $3, $2, 3, 4
+; MIPS32R6-NEXT:    srl $3, $2, 3
+; MIPS32R6-NEXT:    andi $3, $3, 12
 ; MIPS32R6-NEXT:    subu $1, $1, $3
 ; MIPS32R6-NEXT:    sw $zero, 12($sp)
 ; MIPS32R6-NEXT:    sw $zero, 8($sp)
 ; MIPS32R6-NEXT:    sw $zero, 4($sp)
 ; MIPS32R6-NEXT:    sw $zero, 0($sp)
 ; MIPS32R6-NEXT:    lw $3, 4($1)
-; MIPS32R6-NEXT:    sll $4, $3, 1
 ; MIPS32R6-NEXT:    lw $5, 8($1)
-; MIPS32R6-NEXT:    andi $2, $2, 7
-; MIPS32R6-NEXT:    not $6, $2
-; MIPS32R6-NEXT:    srlv $7, $5, $2
-; MIPS32R6-NEXT:    sllv $4, $4, $6
+; MIPS32R6-NEXT:    srlv $4, $5, $2
+; MIPS32R6-NEXT:    sll $6, $3, 1
+; MIPS32R6-NEXT:    andi $7, $2, 31
+; MIPS32R6-NEXT:    xori $7, $7, 31
+; MIPS32R6-NEXT:    sllv $6, $6, $7
 ; MIPS32R6-NEXT:    srlv $3, $3, $2
-; MIPS32R6-NEXT:    lw $6, 0($1)
-; MIPS32R6-NEXT:    sll $8, $6, 1
-; MIPS32R6-NEXT:    xori $9, $2, 31
-; MIPS32R6-NEXT:    sllv $8, $8, $9
-; MIPS32R6-NEXT:    or $3, $3, $8
-; MIPS32R6-NEXT:    or $4, $7, $4
+; MIPS32R6-NEXT:    lw $8, 0($1)
+; MIPS32R6-NEXT:    sll $9, $8, 1
+; MIPS32R6-NEXT:    sllv $9, $9, $7
+; MIPS32R6-NEXT:    or $3, $3, $9
+; MIPS32R6-NEXT:    or $4, $4, $6
 ; MIPS32R6-NEXT:    lw $1, 12($1)
 ; MIPS32R6-NEXT:    srlv $1, $1, $2
 ; MIPS32R6-NEXT:    sll $5, $5, 1
-; MIPS32R6-NEXT:    sllv $5, $5, $9
+; MIPS32R6-NEXT:    sllv $5, $5, $7
 ; MIPS32R6-NEXT:    or $5, $1, $5
-; MIPS32R6-NEXT:    srlv $2, $6, $2
+; MIPS32R6-NEXT:    srlv $2, $8, $2
 ; MIPS32R6-NEXT:    jr $ra
 ; MIPS32R6-NEXT:    addiu $sp, $sp, 32
 ;
@@ -677,53 +639,37 @@ define signext i128 @lshr_i128(i128 signext %a, i128 signext %b) {
 ; MMR3-NEXT:    swp $16, 32($sp)
 ; MMR3-NEXT:    .cfi_offset 17, -4
 ; MMR3-NEXT:    .cfi_offset 16, -8
-; MMR3-NEXT:    swl $7, 28($sp)
-; MMR3-NEXT:    swl $6, 24($sp)
-; MMR3-NEXT:    swl $5, 20($sp)
 ; MMR3-NEXT:    li16 $2, 0
-; MMR3-NEXT:    swl $4, 16($sp)
-; MMR3-NEXT:    swl $2, 12($sp)
-; MMR3-NEXT:    swl $2, 8($sp)
-; MMR3-NEXT:    swl $2, 4($sp)
-; MMR3-NEXT:    swl $2, 0($sp)
-; MMR3-NEXT:    swr $7, 31($sp)
-; MMR3-NEXT:    swr $6, 27($sp)
-; MMR3-NEXT:    swr $5, 23($sp)
-; MMR3-NEXT:    swr $4, 19($sp)
-; MMR3-NEXT:    swr $2, 15($sp)
-; MMR3-NEXT:    swr $2, 11($sp)
-; MMR3-NEXT:    swr $2, 7($sp)
-; MMR3-NEXT:    swr $2, 3($sp)
+; MMR3-NEXT:    swp $6, 24($sp)
+; MMR3-NEXT:    swp $4, 16($sp)
+; MMR3-NEXT:    sw $2, 12($sp)
+; MMR3-NEXT:    sw $2, 8($sp)
+; MMR3-NEXT:    sw $2, 4($sp)
+; MMR3-NEXT:    sw $2, 0($sp)
 ; MMR3-NEXT:    addiur1sp $2, 0
 ; MMR3-NEXT:    addiur2 $2, $2, 16
 ; MMR3-NEXT:    lw $3, 68($sp)
-; MMR3-NEXT:    ext $4, $3, 3, 4
-; MMR3-NEXT:    subu16 $2, $2, $4
-; MMR3-NEXT:    lwl $7, 4($2)
-; MMR3-NEXT:    lwr $7, 7($2)
-; MMR3-NEXT:    sll16 $4, $7, 1
-; MMR3-NEXT:    lwl $5, 8($2)
-; MMR3-NEXT:    lwr $5, 11($2)
-; MMR3-NEXT:    andi16 $6, $3, 7
-; MMR3-NEXT:    not16 $3, $6
-; MMR3-NEXT:    andi16 $3, $3, 31
-; MMR3-NEXT:    srlv $16, $5, $6
-; MMR3-NEXT:    sllv $4, $4, $3
-; MMR3-NEXT:    srlv $17, $7, $6
-; MMR3-NEXT:    lwl $7, 0($2)
-; MMR3-NEXT:    lwr $7, 3($2)
-; MMR3-NEXT:    sll16 $3, $7, 1
-; MMR3-NEXT:    xori $1, $6, 31
+; MMR3-NEXT:    srl16 $4, $3, 3
+; MMR3-NEXT:    andi $4, $4, 12
+; MMR3-NEXT:    subu16 $5, $2, $4
+; MMR3-NEXT:    lwp $6, 4($5)
+; MMR3-NEXT:    andi16 $2, $3, 31
+; MMR3-NEXT:    srlv $16, $7, $2
+; MMR3-NEXT:    sll16 $3, $6, 1
+; MMR3-NEXT:    xori $1, $2, 31
+; MMR3-NEXT:    sllv $4, $3, $1
+; MMR3-NEXT:    srlv $6, $6, $2
+; MMR3-NEXT:    lw16 $17, 0($5)
+; MMR3-NEXT:    sll16 $3, $17, 1
 ; MMR3-NEXT:    sllv $3, $3, $1
-; MMR3-NEXT:    or16 $3, $17
+; MMR3-NEXT:    or16 $3, $6
 ; MMR3-NEXT:    or16 $4, $16
-; MMR3-NEXT:    lwl $8, 12($2)
-; MMR3-NEXT:    lwr $8, 15($2)
-; MMR3-NEXT:    srlv $2, $8, $6
-; MMR3-NEXT:    sll16 $5, $5, 1
+; MMR3-NEXT:    lw16 $5, 12($5)
+; MMR3-NEXT:    srlv $6, $5, $2
+; MMR3-NEXT:    sll16 $5, $7, 1
 ; MMR3-NEXT:    sllv $5, $5, $1
-; MMR3-NEXT:    or16 $5, $2
-; MMR3-NEXT:    srlv $2, $7, $6
+; MMR3-NEXT:    or16 $5, $6
+; MMR3-NEXT:    srlv $2, $17, $2
 ; MMR3-NEXT:    lwp $16, 32($sp)
 ; MMR3-NEXT:    addiusp 40
 ; MMR3-NEXT:    jrc $ra
@@ -735,40 +681,39 @@ define signext i128 @lshr_i128(i128 signext %a, i128 signext %b) {
 ; MMR6-NEXT:    sw $16, 36($sp) # 4-byte Folded Spill
 ; MMR6-NEXT:    .cfi_offset 16, -4
 ; MMR6-NEXT:    li16 $2, 0
-; MMR6-NEXT:    sw $7, 32($sp)
-; MMR6-NEXT:    sw $6, 28($sp)
-; MMR6-NEXT:    sw $5, 24($sp)
-; MMR6-NEXT:    sw $4, 20($sp)
-; MMR6-NEXT:    sw $2, 16($sp)
+; MMR6-NEXT:    sw $7, 28($sp)
+; MMR6-NEXT:    sw $6, 24($sp)
+; MMR6-NEXT:    sw $5, 20($sp)
+; MMR6-NEXT:    sw $4, 16($sp)
 ; MMR6-NEXT:    sw $2, 12($sp)
 ; MMR6-NEXT:    sw $2, 8($sp)
 ; MMR6-NEXT:    sw $2, 4($sp)
-; MMR6-NEXT:    addiu $2, $sp, 4
+; MMR6-NEXT:    sw $2, 0($sp)
+; MMR6-NEXT:    addiu $2, $sp, 0
 ; MMR6-NEXT:    addiur2 $2, $2, 16
 ; MMR6-NEXT:    lw $3, 68($sp)
-; MMR6-NEXT:    ext $4, $3, 3, 4
-; MMR6-NEXT:    subu16 $5, $2, $4
-; MMR6-NEXT:    lw16 $4, 4($5)
-; MMR6-NEXT:    sll16 $6, $4, 1
-; MMR6-NEXT:    lw16 $7, 8($5)
-; MMR6-NEXT:    andi16 $2, $3, 7
-; MMR6-NEXT:    not16 $3, $2
-; MMR6-NEXT:    andi16 $3, $3, 31
-; MMR6-NEXT:    srlv $1, $7, $2
-; MMR6-NEXT:    sllv $6, $6, $3
-; MMR6-NEXT:    srlv $3, $4, $2
-; MMR6-NEXT:    lw16 $16, 0($5)
+; MMR6-NEXT:    srl16 $4, $3, 3
+; MMR6-NEXT:    andi $4, $4, 12
+; MMR6-NEXT:    subu16 $2, $2, $4
+; MMR6-NEXT:    lw16 $4, 4($2)
+; MMR6-NEXT:    lw16 $5, 8($2)
+; MMR6-NEXT:    andi16 $6, $3, 31
+; MMR6-NEXT:    srlv $1, $5, $6
+; MMR6-NEXT:    sll16 $3, $4, 1
+; MMR6-NEXT:    xori $7, $6, 31
+; MMR6-NEXT:    sllv $8, $3, $7
+; MMR6-NEXT:    srlv $3, $4, $6
+; MMR6-NEXT:    lw16 $16, 0($2)
 ; MMR6-NEXT:    sll16 $4, $16, 1
-; MMR6-NEXT:    xori $8, $2, 31
-; MMR6-NEXT:    sllv $4, $4, $8
+; MMR6-NEXT:    sllv $4, $4, $7
 ; MMR6-NEXT:    or $3, $3, $4
-; MMR6-NEXT:    or $4, $1, $6
-; MMR6-NEXT:    lw16 $5, 12($5)
-; MMR6-NEXT:    srlv $1, $5, $2
-; MMR6-NEXT:    sll16 $5, $7, 1
-; MMR6-NEXT:    sllv $5, $5, $8
-; MMR6-NEXT:    or $5, $1, $5
-; MMR6-NEXT:    srlv $2, $16, $2
+; MMR6-NEXT:    or $4, $1, $8
+; MMR6-NEXT:    lw16 $2, 12($2)
+; MMR6-NEXT:    srlv $1, $2, $6
+; MMR6-NEXT:    sll16 $2, $5, 1
+; MMR6-NEXT:    sllv $2, $2, $7
+; MMR6-NEXT:    or $5, $1, $2
+; MMR6-NEXT:    srlv $2, $16, $6
 ; MMR6-NEXT:    lw $16, 36($sp) # 4-byte Folded Reload
 ; MMR6-NEXT:    addiu $sp, $sp, 40
 ; MMR6-NEXT:    jrc $ra

diff --git a/llvm/test/CodeGen/Mips/llvm-ir/shl.ll b/llvm/test/CodeGen/Mips/llvm-ir/shl.ll
index 81f089a5294708..394890a9dcc7c4 100644
--- a/llvm/test/CodeGen/Mips/llvm-ir/shl.ll
+++ b/llvm/test/CodeGen/Mips/llvm-ir/shl.ll
@@ -440,49 +440,36 @@ define signext i128 @shl_i128(i128 signext %a, i128 signext %b) {
 ; MIPS2:       # %bb.0: # %entry
 ; MIPS2-NEXT:    addiu $sp, $sp, -32
 ; MIPS2-NEXT:    .cfi_def_cfa_offset 32
-; MIPS2-NEXT:    swl $zero, 28($sp)
-; MIPS2-NEXT:    swl $zero, 24($sp)
-; MIPS2-NEXT:    swl $zero, 20($sp)
-; MIPS2-NEXT:    swl $zero, 16($sp)
-; MIPS2-NEXT:    swl $7, 12($sp)
-; MIPS2-NEXT:    swl $6, 8($sp)
-; MIPS2-NEXT:    swl $5, 4($sp)
-; MIPS2-NEXT:    swl $4, 0($sp)
-; MIPS2-NEXT:    swr $zero, 31($sp)
-; MIPS2-NEXT:    swr $zero, 27($sp)
-; MIPS2-NEXT:    swr $zero, 23($sp)
-; MIPS2-NEXT:    swr $zero, 19($sp)
-; MIPS2-NEXT:    swr $7, 15($sp)
-; MIPS2-NEXT:    swr $6, 11($sp)
-; MIPS2-NEXT:    swr $5, 7($sp)
-; MIPS2-NEXT:    swr $4, 3($sp)
 ; MIPS2-NEXT:    lw $1, 60($sp)
 ; MIPS2-NEXT:    srl $2, $1, 3
-; MIPS2-NEXT:    andi $2, $2, 15
+; MIPS2-NEXT:    sw $7, 12($sp)
+; MIPS2-NEXT:    sw $6, 8($sp)
+; MIPS2-NEXT:    sw $5, 4($sp)
+; MIPS2-NEXT:    sw $4, 0($sp)
+; MIPS2-NEXT:    andi $2, $2, 12
 ; MIPS2-NEXT:    addiu $3, $sp, 0
 ; MIPS2-NEXT:    addu $4, $3, $2
-; MIPS2-NEXT:    lwl $5, 8($4)
-; MIPS2-NEXT:    lwr $5, 11($4)
-; MIPS2-NEXT:    srl $2, $5, 1
-; MIPS2-NEXT:    lwl $3, 4($4)
-; MIPS2-NEXT:    lwr $3, 7($4)
-; MIPS2-NEXT:    andi $1, $1, 7
-; MIPS2-NEXT:    not $6, $1
-; MIPS2-NEXT:    sllv $7, $3, $1
-; MIPS2-NEXT:    srlv $6, $2, $6
-; MIPS2-NEXT:    lwl $2, 0($4)
-; MIPS2-NEXT:    lwr $2, 3($4)
-; MIPS2-NEXT:    sllv $2, $2, $1
-; MIPS2-NEXT:    srl $3, $3, 1
-; MIPS2-NEXT:    xori $8, $1, 31
-; MIPS2-NEXT:    srlv $3, $3, $8
-; MIPS2-NEXT:    or $2, $2, $3
-; MIPS2-NEXT:    or $3, $7, $6
+; MIPS2-NEXT:    sw $zero, 28($sp)
+; MIPS2-NEXT:    sw $zero, 24($sp)
+; MIPS2-NEXT:    sw $zero, 20($sp)
+; MIPS2-NEXT:    sw $zero, 16($sp)
+; MIPS2-NEXT:    lw $5, 8($4)
+; MIPS2-NEXT:    lw $2, 4($4)
+; MIPS2-NEXT:    sllv $3, $2, $1
+; MIPS2-NEXT:    srl $6, $5, 1
+; MIPS2-NEXT:    andi $7, $1, 31
+; MIPS2-NEXT:    xori $7, $7, 31
+; MIPS2-NEXT:    srlv $6, $6, $7
+; MIPS2-NEXT:    lw $8, 0($4)
+; MIPS2-NEXT:    sllv $8, $8, $1
+; MIPS2-NEXT:    srl $2, $2, 1
+; MIPS2-NEXT:    srlv $2, $2, $7
+; MIPS2-NEXT:    or $2, $8, $2
+; MIPS2-NEXT:    or $3, $3, $6
 ; MIPS2-NEXT:    sllv $5, $5, $1
-; MIPS2-NEXT:    lwl $6, 12($4)
-; MIPS2-NEXT:    lwr $6, 15($4)
+; MIPS2-NEXT:    lw $6, 12($4)
 ; MIPS2-NEXT:    srl $4, $6, 1
-; MIPS2-NEXT:    srlv $4, $4, $8
+; MIPS2-NEXT:    srlv $4, $4, $7
 ; MIPS2-NEXT:    or $4, $5, $4
 ; MIPS2-NEXT:    sllv $5, $6, $1
 ; MIPS2-NEXT:    jr $ra
@@ -492,49 +479,36 @@ define signext i128 @shl_i128(i128 signext %a, i128 signext %b) {
 ; MIPS32:       # %bb.0: # %entry
 ; MIPS32-NEXT:    addiu $sp, $sp, -32
 ; MIPS32-NEXT:    .cfi_def_cfa_offset 32
-; MIPS32-NEXT:    swl $zero, 28($sp)
-; MIPS32-NEXT:    swl $zero, 24($sp)
-; MIPS32-NEXT:    swl $zero, 20($sp)
-; MIPS32-NEXT:    swl $zero, 16($sp)
-; MIPS32-NEXT:    swl $7, 12($sp)
-; MIPS32-NEXT:    swl $6, 8($sp)
-; MIPS32-NEXT:    swl $5, 4($sp)
-; MIPS32-NEXT:    swl $4, 0($sp)
-; MIPS32-NEXT:    swr $zero, 31($sp)
-; MIPS32-NEXT:    swr $zero, 27($sp)
-; MIPS32-NEXT:    swr $zero, 23($sp)
-; MIPS32-NEXT:    swr $zero, 19($sp)
-; MIPS32-NEXT:    swr $7, 15($sp)
-; MIPS32-NEXT:    swr $6, 11($sp)
-; MIPS32-NEXT:    swr $5, 7($sp)
-; MIPS32-NEXT:    swr $4, 3($sp)
 ; MIPS32-NEXT:    lw $1, 60($sp)
 ; MIPS32-NEXT:    srl $2, $1, 3
-; MIPS32-NEXT:    andi $2, $2, 15
+; MIPS32-NEXT:    sw $7, 12($sp)
+; MIPS32-NEXT:    sw $6, 8($sp)
+; MIPS32-NEXT:    sw $5, 4($sp)
+; MIPS32-NEXT:    sw $4, 0($sp)
+; MIPS32-NEXT:    andi $2, $2, 12
 ; MIPS32-NEXT:    addiu $3, $sp, 0
 ; MIPS32-NEXT:    addu $4, $3, $2
-; MIPS32-NEXT:    lwl $5, 8($4)
-; MIPS32-NEXT:    lwr $5, 11($4)
-; MIPS32-NEXT:    srl $2, $5, 1
-; MIPS32-NEXT:    lwl $3, 4($4)
-; MIPS32-NEXT:    lwr $3, 7($4)
-; MIPS32-NEXT:    andi $1, $1, 7
-; MIPS32-NEXT:    not $6, $1
-; MIPS32-NEXT:    sllv $7, $3, $1
-; MIPS32-NEXT:    srlv $6, $2, $6
-; MIPS32-NEXT:    lwl $2, 0($4)
-; MIPS32-NEXT:    lwr $2, 3($4)
-; MIPS32-NEXT:    sllv $2, $2, $1
-; MIPS32-NEXT:    srl $3, $3, 1
-; MIPS32-NEXT:    xori $8, $1, 31
-; MIPS32-NEXT:    srlv $3, $3, $8
-; MIPS32-NEXT:    or $2, $2, $3
-; MIPS32-NEXT:    or $3, $7, $6
+; MIPS32-NEXT:    sw $zero, 28($sp)
+; MIPS32-NEXT:    sw $zero, 24($sp)
+; MIPS32-NEXT:    sw $zero, 20($sp)
+; MIPS32-NEXT:    sw $zero, 16($sp)
+; MIPS32-NEXT:    lw $5, 8($4)
+; MIPS32-NEXT:    lw $2, 4($4)
+; MIPS32-NEXT:    sllv $3, $2, $1
+; MIPS32-NEXT:    srl $6, $5, 1
+; MIPS32-NEXT:    andi $7, $1, 31
+; MIPS32-NEXT:    xori $7, $7, 31
+; MIPS32-NEXT:    srlv $6, $6, $7
+; MIPS32-NEXT:    lw $8, 0($4)
+; MIPS32-NEXT:    sllv $8, $8, $1
+; MIPS32-NEXT:    srl $2, $2, 1
+; MIPS32-NEXT:    srlv $2, $2, $7
+; MIPS32-NEXT:    or $2, $8, $2
+; MIPS32-NEXT:    or $3, $3, $6
 ; MIPS32-NEXT:    sllv $5, $5, $1
-; MIPS32-NEXT:    lwl $6, 12($4)
-; MIPS32-NEXT:    lwr $6, 15($4)
+; MIPS32-NEXT:    lw $6, 12($4)
 ; MIPS32-NEXT:    srl $4, $6, 1
-; MIPS32-NEXT:    srlv $4, $4, $8
+; MIPS32-NEXT:    srlv $4, $4, $7
 ; MIPS32-NEXT:    or $4, $5, $4
 ; MIPS32-NEXT:    sllv $5, $6, $1
 ; MIPS32-NEXT:    jr $ra
@@ -544,48 +518,36 @@ define signext i128 @shl_i128(i128 signext %a, i128 signext %b) {
 ; MIPS32R2:       # %bb.0: # %entry
 ; MIPS32R2-NEXT:    addiu $sp, $sp, -32
 ; MIPS32R2-NEXT:    .cfi_def_cfa_offset 32
-; MIPS32R2-NEXT:    swl $zero, 28($sp)
-; MIPS32R2-NEXT:    swl $zero, 24($sp)
-; MIPS32R2-NEXT:    swl $zero, 20($sp)
-; MIPS32R2-NEXT:    swl $zero, 16($sp)
-; MIPS32R2-NEXT:    swl $7, 12($sp)
-; MIPS32R2-NEXT:    swl $6, 8($sp)
-; MIPS32R2-NEXT:    swl $5, 4($sp)
-; MIPS32R2-NEXT:    swl $4, 0($sp)
-; MIPS32R2-NEXT:    swr $zero, 31($sp)
-; MIPS32R2-NEXT:    swr $zero, 27($sp)
-; MIPS32R2-NEXT:    swr $zero, 23($sp)
-; MIPS32R2-NEXT:    swr $zero, 19($sp)
-; MIPS32R2-NEXT:    swr $7, 15($sp)
-; MIPS32R2-NEXT:    swr $6, 11($sp)
-; MIPS32R2-NEXT:    swr $5, 7($sp)
-; MIPS32R2-NEXT:    swr $4, 3($sp)
 ; MIPS32R2-NEXT:    lw $1, 60($sp)
-; MIPS32R2-NEXT:    ext $2, $1, 3, 4
+; MIPS32R2-NEXT:    srl $2, $1, 3
+; MIPS32R2-NEXT:    sw $7, 12($sp)
+; MIPS32R2-NEXT:    sw $6, 8($sp)
+; MIPS32R2-NEXT:    sw $5, 4($sp)
+; MIPS32R2-NEXT:    sw $4, 0($sp)
+; MIPS32R2-NEXT:    andi $2, $2, 12
 ; MIPS32R2-NEXT:    addiu $3, $sp, 0
 ; MIPS32R2-NEXT:    addu $4, $3, $2
-; MIPS32R2-NEXT:    lwl $5, 8($4)
-; MIPS32R2-NEXT:    lwr $5, 11($4)
-; MIPS32R2-NEXT:    srl $2, $5, 1
-; MIPS32R2-NEXT:    lwl $3, 4($4)
-; MIPS32R2-NEXT:    lwr $3, 7($4)
-; MIPS32R2-NEXT:    andi $1, $1, 7
-; MIPS32R2-NEXT:    not $6, $1
-; MIPS32R2-NEXT:    sllv $7, $3, $1
-; MIPS32R2-NEXT:    srlv $6, $2, $6
-; MIPS32R2-NEXT:    lwl $2, 0($4)
-; MIPS32R2-NEXT:    lwr $2, 3($4)
-; MIPS32R2-NEXT:    sllv $2, $2, $1
-; MIPS32R2-NEXT:    srl $3, $3, 1
-; MIPS32R2-NEXT:    xori $8, $1, 31
-; MIPS32R2-NEXT:    srlv $3, $3, $8
-; MIPS32R2-NEXT:    or $2, $2, $3
-; MIPS32R2-NEXT:    or $3, $7, $6
+; MIPS32R2-NEXT:    sw $zero, 28($sp)
+; MIPS32R2-NEXT:    sw $zero, 24($sp)
+; MIPS32R2-NEXT:    sw $zero, 20($sp)
+; MIPS32R2-NEXT:    sw $zero, 16($sp)
+; MIPS32R2-NEXT:    lw $5, 8($4)
+; MIPS32R2-NEXT:    lw $2, 4($4)
+; MIPS32R2-NEXT:    sllv $3, $2, $1
+; MIPS32R2-NEXT:    srl $6, $5, 1
+; MIPS32R2-NEXT:    andi $7, $1, 31
+; MIPS32R2-NEXT:    xori $7, $7, 31
+; MIPS32R2-NEXT:    srlv $6, $6, $7
+; MIPS32R2-NEXT:    lw $8, 0($4)
+; MIPS32R2-NEXT:    sllv $8, $8, $1
+; MIPS32R2-NEXT:    srl $2, $2, 1
+; MIPS32R2-NEXT:    srlv $2, $2, $7
+; MIPS32R2-NEXT:    or $2, $8, $2
+; MIPS32R2-NEXT:    or $3, $3, $6
 ; MIPS32R2-NEXT:    sllv $5, $5, $1
-; MIPS32R2-NEXT:    lwl $6, 12($4)
-; MIPS32R2-NEXT:    lwr $6, 15($4)
+; MIPS32R2-NEXT:    lw $6, 12($4)
 ; MIPS32R2-NEXT:    srl $4, $6, 1
-; MIPS32R2-NEXT:    srlv $4, $4, $8
+; MIPS32R2-NEXT:    srlv $4, $4, $7
 ; MIPS32R2-NEXT:    or $4, $5, $4
 ; MIPS32R2-NEXT:    sllv $5, $6, $1
 ; MIPS32R2-NEXT:    jr $ra
@@ -596,11 +558,12 @@ define signext i128 @shl_i128(i128 signext %a, i128 signext %b) {
 ; MIPS32R6-NEXT:    addiu $sp, $sp, -32
 ; MIPS32R6-NEXT:    .cfi_def_cfa_offset 32
 ; MIPS32R6-NEXT:    lw $1, 60($sp)
+; MIPS32R6-NEXT:    srl $2, $1, 3
 ; MIPS32R6-NEXT:    sw $7, 12($sp)
 ; MIPS32R6-NEXT:    sw $6, 8($sp)
 ; MIPS32R6-NEXT:    sw $5, 4($sp)
 ; MIPS32R6-NEXT:    sw $4, 0($sp)
-; MIPS32R6-NEXT:    ext $2, $1, 3, 4
+; MIPS32R6-NEXT:    andi $2, $2, 12
 ; MIPS32R6-NEXT:    addiu $3, $sp, 0
 ; MIPS32R6-NEXT:    addu $4, $3, $2
 ; MIPS32R6-NEXT:    sw $zero, 28($sp)
@@ -608,23 +571,22 @@ define signext i128 @shl_i128(i128 signext %a, i128 signext %b) {
 ; MIPS32R6-NEXT:    sw $zero, 20($sp)
 ; MIPS32R6-NEXT:    sw $zero, 16($sp)
 ; MIPS32R6-NEXT:    lw $5, 8($4)
-; MIPS32R6-NEXT:    srl $2, $5, 1
-; MIPS32R6-NEXT:    lw $3, 4($4)
-; MIPS32R6-NEXT:    andi $1, $1, 7
-; MIPS32R6-NEXT:    not $6, $1
-; MIPS32R6-NEXT:    sllv $7, $3, $1
-; MIPS32R6-NEXT:    srlv $6, $2, $6
-; MIPS32R6-NEXT:    lw $2, 0($4)
-; MIPS32R6-NEXT:    sllv $2, $2, $1
-; MIPS32R6-NEXT:    srl $3, $3, 1
-; MIPS32R6-NEXT:    xori $8, $1, 31
-; MIPS32R6-NEXT:    srlv $3, $3, $8
-; MIPS32R6-NEXT:    or $2, $2, $3
-; MIPS32R6-NEXT:    or $3, $7, $6
+; MIPS32R6-NEXT:    lw $2, 4($4)
+; MIPS32R6-NEXT:    sllv $3, $2, $1
+; MIPS32R6-NEXT:    srl $6, $5, 1
+; MIPS32R6-NEXT:    andi $7, $1, 31
+; MIPS32R6-NEXT:    xori $7, $7, 31
+; MIPS32R6-NEXT:    srlv $6, $6, $7
+; MIPS32R6-NEXT:    lw $8, 0($4)
+; MIPS32R6-NEXT:    sllv $8, $8, $1
+; MIPS32R6-NEXT:    srl $2, $2, 1
+; MIPS32R6-NEXT:    srlv $2, $2, $7
+; MIPS32R6-NEXT:    or $2, $8, $2
+; MIPS32R6-NEXT:    or $3, $3, $6
 ; MIPS32R6-NEXT:    sllv $5, $5, $1
 ; MIPS32R6-NEXT:    lw $6, 12($4)
 ; MIPS32R6-NEXT:    srl $4, $6, 1
-; MIPS32R6-NEXT:    srlv $4, $4, $8
+; MIPS32R6-NEXT:    srlv $4, $4, $7
 ; MIPS32R6-NEXT:    or $4, $5, $4
 ; MIPS32R6-NEXT:    sllv $5, $6, $1
 ; MIPS32R6-NEXT:    jr $ra
@@ -722,47 +684,32 @@ define signext i128 @shl_i128(i128 signext %a, i128 signext %b) {
 ; MMR3-NEXT:    .cfi_offset 17, -4
 ; MMR3-NEXT:    .cfi_offset 16, -8
 ; MMR3-NEXT:    li16 $2, 0
-; MMR3-NEXT:    swl $2, 28($sp)
-; MMR3-NEXT:    swl $2, 24($sp)
-; MMR3-NEXT:    swl $2, 20($sp)
-; MMR3-NEXT:    swl $2, 16($sp)
-; MMR3-NEXT:    swl $7, 12($sp)
-; MMR3-NEXT:    swl $6, 8($sp)
-; MMR3-NEXT:    swl $5, 4($sp)
-; MMR3-NEXT:    swl $4, 0($sp)
-; MMR3-NEXT:    swr $2, 31($sp)
-; MMR3-NEXT:    swr $2, 27($sp)
-; MMR3-NEXT:    swr $2, 23($sp)
-; MMR3-NEXT:    swr $2, 19($sp)
-; MMR3-NEXT:    swr $7, 15($sp)
-; MMR3-NEXT:    swr $6, 11($sp)
-; MMR3-NEXT:    swr $5, 7($sp)
-; MMR3-NEXT:    swr $4, 3($sp)
+; MMR3-NEXT:    sw $2, 28($sp)
+; MMR3-NEXT:    sw $2, 24($sp)
+; MMR3-NEXT:    sw $2, 20($sp)
+; MMR3-NEXT:    sw $2, 16($sp)
+; MMR3-NEXT:    swp $6, 8($sp)
+; MMR3-NEXT:    swp $4, 0($sp)
 ; MMR3-NEXT:    lw $2, 68($sp)
-; MMR3-NEXT:    ext $3, $2, 3, 4
+; MMR3-NEXT:    srl16 $3, $2, 3
+; MMR3-NEXT:    andi $3, $3, 12
 ; MMR3-NEXT:    addiur1sp $4, 0
 ; MMR3-NEXT:    addu16 $4, $4, $3
-; MMR3-NEXT:    lwl $6, 8($4)
-; MMR3-NEXT:    lwr $6, 11($4)
-; MMR3-NEXT:    srl16 $3, $6, 1
-; MMR3-NEXT:    lwl $7, 4($4)
-; MMR3-NEXT:    lwr $7, 7($4)
-; MMR3-NEXT:    andi16 $5, $2, 7
-; MMR3-NEXT:    not16 $2, $5
-; MMR3-NEXT:    andi16 $2, $2, 31
+; MMR3-NEXT:    lw16 $6, 8($4)
+; MMR3-NEXT:    lw16 $7, 4($4)
+; MMR3-NEXT:    andi16 $5, $2, 31
 ; MMR3-NEXT:    sllv $16, $7, $5
-; MMR3-NEXT:    srlv $3, $3, $2
-; MMR3-NEXT:    lwl $1, 0($4)
-; MMR3-NEXT:    lwr $1, 3($4)
-; MMR3-NEXT:    sllv $17, $1, $5
-; MMR3-NEXT:    srl16 $2, $7, 1
+; MMR3-NEXT:    srl16 $2, $6, 1
 ; MMR3-NEXT:    xori $1, $5, 31
+; MMR3-NEXT:    srlv $3, $2, $1
+; MMR3-NEXT:    lw16 $2, 0($4)
+; MMR3-NEXT:    sllv $17, $2, $5
+; MMR3-NEXT:    srl16 $2, $7, 1
 ; MMR3-NEXT:    srlv $2, $2, $1
 ; MMR3-NEXT:    or16 $2, $17
 ; MMR3-NEXT:    or16 $3, $16
 ; MMR3-NEXT:    sllv $6, $6, $5
-; MMR3-NEXT:    lwl $7, 12($4)
-; MMR3-NEXT:    lwr $7, 15($4)
+; MMR3-NEXT:    lw16 $7, 12($4)
 ; MMR3-NEXT:    srl16 $4, $7, 1
 ; MMR3-NEXT:    srlv $4, $4, $1
 ; MMR3-NEXT:    or16 $4, $6
@@ -785,30 +732,29 @@ define signext i128 @shl_i128(i128 signext %a, i128 signext %b) {
 ; MMR6-NEXT:    sw $5, 4($sp)
 ; MMR6-NEXT:    sw $4, 0($sp)
 ; MMR6-NEXT:    lw $2, 60($sp)
-; MMR6-NEXT:    ext $3, $2, 3, 4
+; MMR6-NEXT:    srl16 $3, $2, 3
+; MMR6-NEXT:    andi $3, $3, 12
 ; MMR6-NEXT:    addiu $4, $sp, 0
 ; MMR6-NEXT:    addu16 $4, $4, $3
-; MMR6-NEXT:    lw16 $6, 8($4)
-; MMR6-NEXT:    srl16 $3, $6, 1
-; MMR6-NEXT:    lw16 $7, 4($4)
-; MMR6-NEXT:    andi16 $5, $2, 7
-; MMR6-NEXT:    not16 $2, $5
-; MMR6-NEXT:    andi16 $2, $2, 31
-; MMR6-NEXT:    sllv $1, $7, $5
-; MMR6-NEXT:    srlv $3, $3, $2
+; MMR6-NEXT:    lw16 $5, 8($4)
+; MMR6-NEXT:    lw16 $3, 4($4)
+; MMR6-NEXT:    andi16 $6, $2, 31
+; MMR6-NEXT:    sllv $1, $3, $6
+; MMR6-NEXT:    srl16 $2, $5, 1
+; MMR6-NEXT:    xori $7, $6, 31
+; MMR6-NEXT:    srlv $8, $2, $7
 ; MMR6-NEXT:    lw16 $2, 0($4)
-; MMR6-NEXT:    sllv $2, $2, $5
-; MMR6-NEXT:    srl16 $7, $7, 1
-; MMR6-NEXT:    xori $8, $5, 31
-; MMR6-NEXT:    srlv $7, $7, $8
-; MMR6-NEXT:    or $2, $2, $7
-; MMR6-NEXT:    or $3, $1, $3
-; MMR6-NEXT:    sllv $1, $6, $5
-; MMR6-NEXT:    lw16 $6, 12($4)
-; MMR6-NEXT:    srl16 $4, $6, 1
-; MMR6-NEXT:    srlv $4, $4, $8
+; MMR6-NEXT:    sllv $2, $2, $6
+; MMR6-NEXT:    srl16 $3, $3, 1
+; MMR6-NEXT:    srlv $3, $3, $7
+; MMR6-NEXT:    or $2, $2, $3
+; MMR6-NEXT:    or $3, $1, $8
+; MMR6-NEXT:    sllv $1, $5, $6
+; MMR6-NEXT:    lw16 $5, 12($4)
+; MMR6-NEXT:    srl16 $4, $5, 1
+; MMR6-NEXT:    srlv $4, $4, $7
 ; MMR6-NEXT:    or $4, $1, $4
-; MMR6-NEXT:    sllv $5, $6, $5
+; MMR6-NEXT:    sllv $5, $5, $6
 ; MMR6-NEXT:    addiu $sp, $sp, 32
 ; MMR6-NEXT:    jrc $ra
 entry:

diff --git a/llvm/test/CodeGen/PowerPC/ctrloop-sh.ll b/llvm/test/CodeGen/PowerPC/ctrloop-sh.ll
index c48361e0a8035c..72de456cba395b 100644
--- a/llvm/test/CodeGen/PowerPC/ctrloop-sh.ll
+++ b/llvm/test/CodeGen/PowerPC/ctrloop-sh.ll
@@ -8,58 +8,52 @@ define void @foo1(ptr %a, ptr readonly %b, ptr readonly %c) #0 {
 ; CHECK-LABEL: foo1:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    stwu 1, -64(1)
-; CHECK-NEXT:    stw 28, 48(1) # 4-byte Folded Spill
-; CHECK-NEXT:    li 8, 2048
 ; CHECK-NEXT:    stw 29, 52(1) # 4-byte Folded Spill
-; CHECK-NEXT:    li 6, 0
+; CHECK-NEXT:    li 7, 2048
 ; CHECK-NEXT:    stw 30, 56(1) # 4-byte Folded Spill
-; CHECK-NEXT:    li 7, 7
-; CHECK-NEXT:    mtctr 8
-; CHECK-NEXT:    addi 8, 1, 16
+; CHECK-NEXT:    li 6, 0
+; CHECK-NEXT:    mtctr 7
+; CHECK-NEXT:    addi 7, 1, 16
 ; CHECK-NEXT:  .LBB0_1: # %for.body
 ; CHECK-NEXT:    #
-; CHECK-NEXT:    lwz 9, 0(4)
-; CHECK-NEXT:    lwz 10, 4(4)
-; CHECK-NEXT:    lwz 11, 8(4)
-; CHECK-NEXT:    lwz 12, 12(4)
-; CHECK-NEXT:    lwz 0, 12(5)
+; CHECK-NEXT:    lwz 8, 0(4)
+; CHECK-NEXT:    lwz 9, 4(4)
+; CHECK-NEXT:    lwz 10, 8(4)
+; CHECK-NEXT:    lwz 11, 12(4)
+; CHECK-NEXT:    lwz 12, 12(5)
 ; CHECK-NEXT:    stw 6, 44(1)
 ; CHECK-NEXT:    stw 6, 40(1)
 ; CHECK-NEXT:    stw 6, 36(1)
 ; CHECK-NEXT:    stw 6, 32(1)
-; CHECK-NEXT:    stw 12, 28(1)
-; CHECK-NEXT:    clrlwi 12, 0, 29
-; CHECK-NEXT:    stw 11, 24(1)
-; CHECK-NEXT:    nand 11, 0, 7
-; CHECK-NEXT:    stw 10, 20(1)
-; CHECK-NEXT:    subfic 29, 12, 32
-; CHECK-NEXT:    stw 9, 16(1)
-; CHECK-NEXT:    rlwinm 9, 0, 29, 28, 31
-; CHECK-NEXT:    lwzux 10, 9, 8
-; CHECK-NEXT:    clrlwi 11, 11, 27
-; CHECK-NEXT:    lwz 0, 8(9)
-; CHECK-NEXT:    slw 10, 10, 12
-; CHECK-NEXT:    lwz 30, 4(9)
-; CHECK-NEXT:    lwz 9, 12(9)
-; CHECK-NEXT:    slw 28, 30, 12
-; CHECK-NEXT:    srw 30, 30, 29
-; CHECK-NEXT:    srw 29, 9, 29
-; CHECK-NEXT:    slw 9, 9, 12
-; CHECK-NEXT:    slw 12, 0, 12
-; CHECK-NEXT:    srwi 0, 0, 1
-; CHECK-NEXT:    stw 9, 12(3)
-; CHECK-NEXT:    or 9, 12, 29
-; CHECK-NEXT:    srw 11, 0, 11
-; CHECK-NEXT:    stw 9, 8(3)
-; CHECK-NEXT:    or 9, 10, 30
-; CHECK-NEXT:    stw 9, 0(3)
-; CHECK-NEXT:    or 9, 28, 11
-; CHECK-NEXT:    stw 9, 4(3)
+; CHECK-NEXT:    stw 11, 28(1)
+; CHECK-NEXT:    stw 10, 24(1)
+; CHECK-NEXT:    clrlwi 10, 12, 27
+; CHECK-NEXT:    stw 9, 20(1)
+; CHECK-NEXT:    stw 8, 16(1)
+; CHECK-NEXT:    rlwinm 8, 12, 29, 28, 29
+; CHECK-NEXT:    lwzux 9, 8, 7
+; CHECK-NEXT:    subfic 12, 10, 32
+; CHECK-NEXT:    lwz 11, 8(8)
+; CHECK-NEXT:    slw 9, 9, 10
+; CHECK-NEXT:    lwz 0, 4(8)
+; CHECK-NEXT:    lwz 8, 12(8)
+; CHECK-NEXT:    srw 30, 11, 12
+; CHECK-NEXT:    slw 29, 0, 10
+; CHECK-NEXT:    srw 0, 0, 12
+; CHECK-NEXT:    srw 12, 8, 12
+; CHECK-NEXT:    slw 11, 11, 10
+; CHECK-NEXT:    slw 8, 8, 10
+; CHECK-NEXT:    stw 8, 12(3)
+; CHECK-NEXT:    or 8, 11, 12
+; CHECK-NEXT:    stw 8, 8(3)
+; CHECK-NEXT:    or 8, 9, 0
+; CHECK-NEXT:    stw 8, 0(3)
+; CHECK-NEXT:    or 8, 29, 30
+; CHECK-NEXT:    stw 8, 4(3)
 ; CHECK-NEXT:    bdnz .LBB0_1
 ; CHECK-NEXT:  # %bb.2: # %for.end
 ; CHECK-NEXT:    lwz 30, 56(1) # 4-byte Folded Reload
 ; CHECK-NEXT:    lwz 29, 52(1) # 4-byte Folded Reload
-; CHECK-NEXT:    lwz 28, 48(1) # 4-byte Folded Reload
 ; CHECK-NEXT:    addi 1, 1, 64
 ; CHECK-NEXT:    blr
 entry:
@@ -83,59 +77,53 @@ for.end:                                          ; preds = %for.body
 define void @foo2(ptr %a, ptr readonly %b, ptr readonly %c) #0 {
 ; CHECK-LABEL: foo2:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    stwu 1, -64(1)
-; CHECK-NEXT:    stw 29, 52(1) # 4-byte Folded Spill
-; CHECK-NEXT:    li 7, 2048
-; CHECK-NEXT:    stw 30, 56(1) # 4-byte Folded Spill
-; CHECK-NEXT:    li 6, 7
-; CHECK-NEXT:    mtctr 7
-; CHECK-NEXT:    addi 7, 1, 36
+; CHECK-NEXT:    stwu 1, -48(1)
+; CHECK-NEXT:    stw 30, 40(1) # 4-byte Folded Spill
+; CHECK-NEXT:    li 6, 2048
+; CHECK-NEXT:    mtctr 6
+; CHECK-NEXT:    addi 6, 1, 24
 ; CHECK-NEXT:  .LBB1_1: # %for.body
 ; CHECK-NEXT:    #
-; CHECK-NEXT:    lwz 8, 0(4)
-; CHECK-NEXT:    lwz 10, 8(4)
-; CHECK-NEXT:    lwz 12, 12(5)
-; CHECK-NEXT:    lwz 9, 4(4)
-; CHECK-NEXT:    lwz 11, 12(4)
-; CHECK-NEXT:    stw 10, 44(1)
-; CHECK-NEXT:    rlwinm 10, 12, 29, 28, 31
-; CHECK-NEXT:    stw 8, 36(1)
-; CHECK-NEXT:    srawi 8, 8, 31
-; CHECK-NEXT:    stw 11, 48(1)
-; CHECK-NEXT:    clrlwi 11, 12, 29
-; CHECK-NEXT:    stw 9, 40(1)
-; CHECK-NEXT:    nand 9, 12, 6
-; CHECK-NEXT:    stw 8, 32(1)
-; CHECK-NEXT:    subfic 30, 11, 32
+; CHECK-NEXT:    lwz 7, 0(4)
+; CHECK-NEXT:    lwz 8, 4(4)
+; CHECK-NEXT:    lwz 11, 12(5)
+; CHECK-NEXT:    lwz 9, 8(4)
+; CHECK-NEXT:    lwz 10, 12(4)
 ; CHECK-NEXT:    stw 8, 28(1)
-; CHECK-NEXT:    clrlwi 9, 9, 27
-; CHECK-NEXT:    stw 8, 24(1)
-; CHECK-NEXT:    stw 8, 20(1)
-; CHECK-NEXT:    sub 8, 7, 10
-; CHECK-NEXT:    lwz 10, 4(8)
-; CHECK-NEXT:    lwz 12, 8(8)
-; CHECK-NEXT:    lwz 0, 0(8)
-; CHECK-NEXT:    lwz 8, 12(8)
-; CHECK-NEXT:    srw 29, 12, 11
-; CHECK-NEXT:    slw 12, 12, 30
-; CHECK-NEXT:    slw 30, 0, 30
-; CHECK-NEXT:    srw 8, 8, 11
-; CHECK-NEXT:    sraw 0, 0, 11
-; CHECK-NEXT:    srw 11, 10, 11
-; CHECK-NEXT:    slwi 10, 10, 1
-; CHECK-NEXT:    or 8, 12, 8
-; CHECK-NEXT:    slw 9, 10, 9
-; CHECK-NEXT:    stw 8, 12(3)
-; CHECK-NEXT:    or 8, 30, 11
-; CHECK-NEXT:    stw 8, 4(3)
-; CHECK-NEXT:    or 8, 29, 9
-; CHECK-NEXT:    stw 0, 0(3)
-; CHECK-NEXT:    stw 8, 8(3)
+; CHECK-NEXT:    rlwinm 8, 11, 29, 28, 29
+; CHECK-NEXT:    stw 7, 24(1)
+; CHECK-NEXT:    srawi 7, 7, 31
+; CHECK-NEXT:    stw 10, 36(1)
+; CHECK-NEXT:    clrlwi 10, 11, 27
+; CHECK-NEXT:    stw 9, 32(1)
+; CHECK-NEXT:    subfic 12, 10, 32
+; CHECK-NEXT:    stw 7, 20(1)
+; CHECK-NEXT:    stw 7, 16(1)
+; CHECK-NEXT:    stw 7, 12(1)
+; CHECK-NEXT:    stw 7, 8(1)
+; CHECK-NEXT:    sub 7, 6, 8
+; CHECK-NEXT:    lwz 8, 4(7)
+; CHECK-NEXT:    lwz 9, 0(7)
+; CHECK-NEXT:    lwz 11, 12(7)
+; CHECK-NEXT:    srw 0, 8, 10
+; CHECK-NEXT:    lwz 7, 8(7)
+; CHECK-NEXT:    slw 30, 9, 12
+; CHECK-NEXT:    slw 8, 8, 12
+; CHECK-NEXT:    srw 11, 11, 10
+; CHECK-NEXT:    slw 12, 7, 12
+; CHECK-NEXT:    srw 7, 7, 10
+; CHECK-NEXT:    or 7, 8, 7
+; CHECK-NEXT:    stw 7, 8(3)
+; CHECK-NEXT:    or 7, 12, 11
+; CHECK-NEXT:    sraw 9, 9, 10
+; CHECK-NEXT:    stw 7, 12(3)
+; CHECK-NEXT:    or 7, 30, 0
+; CHECK-NEXT:    stw 9, 0(3)
+; CHECK-NEXT:    stw 7, 4(3)
 ; CHECK-NEXT:    bdnz .LBB1_1
 ; CHECK-NEXT:  # %bb.2: # %for.end
-; CHECK-NEXT:    lwz 30, 56(1) # 4-byte Folded Reload
-; CHECK-NEXT:    lwz 29, 52(1) # 4-byte Folded Reload
-; CHECK-NEXT:    addi 1, 1, 64
+; CHECK-NEXT:    lwz 30, 40(1) # 4-byte Folded Reload
+; CHECK-NEXT:    addi 1, 1, 48
 ; CHECK-NEXT:    blr
 entry:
   br label %for.body
@@ -159,59 +147,53 @@ define void @foo3(ptr %a, ptr readonly %b, ptr readonly %c) #0 {
 ; CHECK-LABEL: foo3:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    stwu 1, -64(1)
-; CHECK-NEXT:    stw 28, 48(1) # 4-byte Folded Spill
-; CHECK-NEXT:    li 8, 2048
 ; CHECK-NEXT:    stw 29, 52(1) # 4-byte Folded Spill
-; CHECK-NEXT:    li 6, 0
+; CHECK-NEXT:    li 7, 2048
 ; CHECK-NEXT:    stw 30, 56(1) # 4-byte Folded Spill
-; CHECK-NEXT:    li 7, 7
-; CHECK-NEXT:    mtctr 8
-; CHECK-NEXT:    addi 8, 1, 32
+; CHECK-NEXT:    li 6, 0
+; CHECK-NEXT:    mtctr 7
+; CHECK-NEXT:    addi 7, 1, 32
 ; CHECK-NEXT:  .LBB2_1: # %for.body
 ; CHECK-NEXT:    #
-; CHECK-NEXT:    lwz 10, 4(4)
-; CHECK-NEXT:    lwz 0, 12(5)
-; CHECK-NEXT:    lwz 9, 0(4)
-; CHECK-NEXT:    lwz 11, 8(4)
-; CHECK-NEXT:    lwz 12, 12(4)
-; CHECK-NEXT:    stw 10, 36(1)
-; CHECK-NEXT:    rlwinm 10, 0, 29, 28, 31
+; CHECK-NEXT:    lwz 8, 0(4)
+; CHECK-NEXT:    lwz 12, 12(5)
+; CHECK-NEXT:    lwz 9, 4(4)
+; CHECK-NEXT:    lwz 10, 8(4)
+; CHECK-NEXT:    lwz 11, 12(4)
+; CHECK-NEXT:    stw 8, 32(1)
+; CHECK-NEXT:    rlwinm 8, 12, 29, 28, 29
 ; CHECK-NEXT:    stw 6, 28(1)
-; CHECK-NEXT:    sub 10, 8, 10
+; CHECK-NEXT:    sub 8, 7, 8
 ; CHECK-NEXT:    stw 6, 24(1)
 ; CHECK-NEXT:    stw 6, 20(1)
 ; CHECK-NEXT:    stw 6, 16(1)
-; CHECK-NEXT:    stw 12, 44(1)
-; CHECK-NEXT:    clrlwi 12, 0, 29
-; CHECK-NEXT:    stw 11, 40(1)
-; CHECK-NEXT:    subfic 29, 12, 32
-; CHECK-NEXT:    stw 9, 32(1)
-; CHECK-NEXT:    nand 9, 0, 7
-; CHECK-NEXT:    lwz 11, 4(10)
-; CHECK-NEXT:    clrlwi 9, 9, 27
-; CHECK-NEXT:    lwz 0, 8(10)
-; CHECK-NEXT:    lwz 30, 0(10)
-; CHECK-NEXT:    lwz 10, 12(10)
-; CHECK-NEXT:    srw 28, 0, 12
-; CHECK-NEXT:    slw 0, 0, 29
-; CHECK-NEXT:    slw 29, 30, 29
-; CHECK-NEXT:    srw 10, 10, 12
-; CHECK-NEXT:    srw 30, 30, 12
-; CHECK-NEXT:    srw 12, 11, 12
-; CHECK-NEXT:    slwi 11, 11, 1
-; CHECK-NEXT:    slw 9, 11, 9
-; CHECK-NEXT:    or 10, 0, 10
-; CHECK-NEXT:    stw 10, 12(3)
-; CHECK-NEXT:    or 10, 29, 12
-; CHECK-NEXT:    or 9, 28, 9
-; CHECK-NEXT:    stw 30, 0(3)
-; CHECK-NEXT:    stw 10, 4(3)
-; CHECK-NEXT:    stw 9, 8(3)
+; CHECK-NEXT:    stw 11, 44(1)
+; CHECK-NEXT:    clrlwi 11, 12, 27
+; CHECK-NEXT:    stw 10, 40(1)
+; CHECK-NEXT:    subfic 0, 11, 32
+; CHECK-NEXT:    stw 9, 36(1)
+; CHECK-NEXT:    lwz 9, 4(8)
+; CHECK-NEXT:    lwz 10, 0(8)
+; CHECK-NEXT:    lwz 12, 12(8)
+; CHECK-NEXT:    srw 30, 9, 11
+; CHECK-NEXT:    lwz 8, 8(8)
+; CHECK-NEXT:    slw 29, 10, 0
+; CHECK-NEXT:    slw 9, 9, 0
+; CHECK-NEXT:    srw 12, 12, 11
+; CHECK-NEXT:    slw 0, 8, 0
+; CHECK-NEXT:    srw 8, 8, 11
+; CHECK-NEXT:    or 8, 9, 8
+; CHECK-NEXT:    stw 8, 8(3)
+; CHECK-NEXT:    or 8, 0, 12
+; CHECK-NEXT:    srw 10, 10, 11
+; CHECK-NEXT:    stw 8, 12(3)
+; CHECK-NEXT:    or 8, 29, 30
+; CHECK-NEXT:    stw 10, 0(3)
+; CHECK-NEXT:    stw 8, 4(3)
 ; CHECK-NEXT:    bdnz .LBB2_1
 ; CHECK-NEXT:  # %bb.2: # %for.end
 ; CHECK-NEXT:    lwz 30, 56(1) # 4-byte Folded Reload
 ; CHECK-NEXT:    lwz 29, 52(1) # 4-byte Folded Reload
-; CHECK-NEXT:    lwz 28, 48(1) # 4-byte Folded Reload
 ; CHECK-NEXT:    addi 1, 1, 64
 ; CHECK-NEXT:    blr
 entry:

diff --git a/llvm/test/CodeGen/PowerPC/pr59074.ll b/llvm/test/CodeGen/PowerPC/pr59074.ll
index 3e328c6ad9f0ba..d3ca1139b4fd11 100644
--- a/llvm/test/CodeGen/PowerPC/pr59074.ll
+++ b/llvm/test/CodeGen/PowerPC/pr59074.ll
@@ -32,37 +32,36 @@ define void @pr59074(ptr %0) {
 ; LE32-NEXT:    li 7, 0
 ; LE32-NEXT:    li 8, 12
 ; LE32-NEXT:    xxswapd 0, 0
+; LE32-NEXT:    rlwimi 5, 6, 0, 30, 28
 ; LE32-NEXT:    addi 4, 4, -12
-; LE32-NEXT:    rlwinm 9, 4, 29, 28, 31
-; LE32-NEXT:    stxvd2x 0, 6, 5
+; LE32-NEXT:    rlwinm 9, 4, 29, 28, 29
+; LE32-NEXT:    stxvd2x 0, 0, 5
 ; LE32-NEXT:    stw 7, 44(1)
 ; LE32-NEXT:    stw 7, 40(1)
 ; LE32-NEXT:    stw 7, 36(1)
 ; LE32-NEXT:    stw 8, 16(1)
+; LE32-NEXT:    clrlwi 4, 4, 27
 ; LE32-NEXT:    lwzux 5, 9, 6
-; LE32-NEXT:    li 6, 7
-; LE32-NEXT:    lwz 7, 8(9)
-; LE32-NEXT:    nand 6, 4, 6
-; LE32-NEXT:    lwz 8, 4(9)
-; LE32-NEXT:    clrlwi 4, 4, 29
-; LE32-NEXT:    lwz 9, 12(9)
-; LE32-NEXT:    clrlwi 6, 6, 27
+; LE32-NEXT:    lwz 6, 8(9)
+; LE32-NEXT:    lwz 7, 4(9)
+; LE32-NEXT:    lwz 8, 12(9)
+; LE32-NEXT:    xori 9, 4, 31
 ; LE32-NEXT:    subfic 11, 4, 32
 ; LE32-NEXT:    srw 5, 5, 4
-; LE32-NEXT:    slwi 10, 7, 1
-; LE32-NEXT:    srw 7, 7, 4
-; LE32-NEXT:    slw 6, 10, 6
-; LE32-NEXT:    srw 10, 8, 4
-; LE32-NEXT:    slw 8, 8, 11
-; LE32-NEXT:    slw 11, 9, 11
-; LE32-NEXT:    srw 4, 9, 4
-; LE32-NEXT:    or 5, 8, 5
-; LE32-NEXT:    or 7, 11, 7
-; LE32-NEXT:    or 6, 10, 6
+; LE32-NEXT:    slwi 10, 6, 1
+; LE32-NEXT:    srw 6, 6, 4
+; LE32-NEXT:    slw 9, 10, 9
+; LE32-NEXT:    srw 10, 7, 4
+; LE32-NEXT:    slw 7, 7, 11
+; LE32-NEXT:    slw 11, 8, 11
+; LE32-NEXT:    srw 4, 8, 4
+; LE32-NEXT:    or 5, 7, 5
+; LE32-NEXT:    or 6, 11, 6
+; LE32-NEXT:    or 7, 10, 9
 ; LE32-NEXT:    stw 4, 12(3)
-; LE32-NEXT:    stw 7, 8(3)
+; LE32-NEXT:    stw 6, 8(3)
 ; LE32-NEXT:    stw 5, 0(3)
-; LE32-NEXT:    stw 6, 4(3)
+; LE32-NEXT:    stw 7, 4(3)
 ; LE32-NEXT:    addi 1, 1, 80
 ; LE32-NEXT:    blr
 ;
@@ -89,37 +88,33 @@ define void @pr59074(ptr %0) {
 ; BE32-NEXT:    li 6, 12
 ; BE32-NEXT:    li 7, 0
 ; BE32-NEXT:    addi 8, 1, -48
-; BE32-NEXT:    li 10, 7
 ; BE32-NEXT:    stxvw4x 0, 0, 5
-; BE32-NEXT:    addi 4, 4, -12
 ; BE32-NEXT:    stw 6, -36(1)
+; BE32-NEXT:    addi 4, 4, -12
 ; BE32-NEXT:    stw 7, -40(1)
 ; BE32-NEXT:    stw 7, -44(1)
-; BE32-NEXT:    rlwinm 9, 4, 29, 28, 31
 ; BE32-NEXT:    stw 7, -48(1)
+; BE32-NEXT:    rlwinm 9, 4, 29, 28, 29
+; BE32-NEXT:    clrlwi 4, 4, 27
 ; BE32-NEXT:    sub 5, 8, 9
-; BE32-NEXT:    nand 6, 4, 10
-; BE32-NEXT:    clrlwi 4, 4, 29
-; BE32-NEXT:    clrlwi 6, 6, 27
-; BE32-NEXT:    lwz 7, 4(5)
-; BE32-NEXT:    lwz 8, 8(5)
-; BE32-NEXT:    lwz 9, 0(5)
-; BE32-NEXT:    lwz 5, 12(5)
-; BE32-NEXT:    slwi 10, 7, 1
-; BE32-NEXT:    srw 11, 8, 4
-; BE32-NEXT:    srw 7, 7, 4
-; BE32-NEXT:    srw 5, 5, 4
-; BE32-NEXT:    slw 6, 10, 6
+; BE32-NEXT:    lwz 6, 4(5)
+; BE32-NEXT:    lwz 7, 0(5)
+; BE32-NEXT:    lwz 8, 12(5)
+; BE32-NEXT:    lwz 5, 8(5)
 ; BE32-NEXT:    subfic 10, 4, 32
-; BE32-NEXT:    srw 4, 9, 4
-; BE32-NEXT:    slw 8, 8, 10
-; BE32-NEXT:    slw 10, 9, 10
-; BE32-NEXT:    or 6, 11, 6
-; BE32-NEXT:    or 7, 10, 7
-; BE32-NEXT:    or 5, 8, 5
+; BE32-NEXT:    srw 9, 6, 4
+; BE32-NEXT:    slw 11, 7, 10
+; BE32-NEXT:    srw 8, 8, 4
+; BE32-NEXT:    slw 6, 6, 10
+; BE32-NEXT:    slw 10, 5, 10
+; BE32-NEXT:    srw 5, 5, 4
+; BE32-NEXT:    srw 4, 7, 4
+; BE32-NEXT:    or 7, 11, 9
+; BE32-NEXT:    or 8, 10, 8
+; BE32-NEXT:    or 5, 6, 5
 ; BE32-NEXT:    stw 4, 0(3)
-; BE32-NEXT:    stw 6, 8(3)
-; BE32-NEXT:    stw 5, 12(3)
+; BE32-NEXT:    stw 5, 8(3)
+; BE32-NEXT:    stw 8, 12(3)
 ; BE32-NEXT:    stw 7, 4(3)
 ; BE32-NEXT:    blr
 entry:

diff --git a/llvm/test/CodeGen/PowerPC/wide-scalar-shift-by-byte-multiple-legalization.ll b/llvm/test/CodeGen/PowerPC/wide-scalar-shift-by-byte-multiple-legalization.ll
index f6fdb4ae207947..4f1b7bdc8b552a 100644
--- a/llvm/test/CodeGen/PowerPC/wide-scalar-shift-by-byte-multiple-legalization.ll
+++ b/llvm/test/CodeGen/PowerPC/wide-scalar-shift-by-byte-multiple-legalization.ll
@@ -233,9 +233,96 @@ define void @lshr_16bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ; LE-32BIT-NEXT:    lwz 9, 8(3)
 ; LE-32BIT-NEXT:    lwz 3, 12(3)
 ; LE-32BIT-NEXT:    lwz 4, 12(4)
+; LE-32BIT-NEXT:    stw 6, 28(1)
+; LE-32BIT-NEXT:    stw 6, 24(1)
+; LE-32BIT-NEXT:    stw 6, 20(1)
+; LE-32BIT-NEXT:    stw 6, 16(1)
+; LE-32BIT-NEXT:    rlwinm 6, 4, 0, 28, 29
+; LE-32BIT-NEXT:    stw 3, 44(1)
+; LE-32BIT-NEXT:    addi 3, 1, 32
+; LE-32BIT-NEXT:    stw 9, 40(1)
+; LE-32BIT-NEXT:    sub 3, 3, 6
+; LE-32BIT-NEXT:    stw 8, 36(1)
+; LE-32BIT-NEXT:    rlwinm 4, 4, 3, 27, 28
+; LE-32BIT-NEXT:    stw 7, 32(1)
+; LE-32BIT-NEXT:    subfic 9, 4, 32
+; LE-32BIT-NEXT:    lwz 6, 4(3)
+; LE-32BIT-NEXT:    lwz 7, 0(3)
+; LE-32BIT-NEXT:    lwz 8, 12(3)
+; LE-32BIT-NEXT:    srw 10, 6, 4
+; LE-32BIT-NEXT:    lwz 3, 8(3)
+; LE-32BIT-NEXT:    slw 11, 7, 9
+; LE-32BIT-NEXT:    slw 6, 6, 9
+; LE-32BIT-NEXT:    srw 8, 8, 4
+; LE-32BIT-NEXT:    slw 9, 3, 9
+; LE-32BIT-NEXT:    srw 3, 3, 4
+; LE-32BIT-NEXT:    or 3, 6, 3
+; LE-32BIT-NEXT:    stw 3, 8(5)
+; LE-32BIT-NEXT:    or 3, 9, 8
+; LE-32BIT-NEXT:    srw 4, 7, 4
+; LE-32BIT-NEXT:    stw 3, 12(5)
+; LE-32BIT-NEXT:    or 3, 11, 10
+; LE-32BIT-NEXT:    stw 4, 0(5)
+; LE-32BIT-NEXT:    stw 3, 4(5)
+; LE-32BIT-NEXT:    addi 1, 1, 48
+; LE-32BIT-NEXT:    blr
+  %src = load i128, ptr %src.ptr, align 1
+  %byteOff = load i128, ptr %byteOff.ptr, align 1
+  %bitOff = shl i128 %byteOff, 3
+  %res = lshr i128 %src, %bitOff
+  store i128 %res, ptr %dst, align 1
+  ret void
+}
+
+define void @lshr_16bytes_wordOff(ptr %src.ptr, ptr %wordOff.ptr, ptr %dst) nounwind {
+; LE-64BIT-LABEL: lshr_16bytes_wordOff:
+; LE-64BIT:       # %bb.0:
+; LE-64BIT-NEXT:    lwz 4, 0(4)
+; LE-64BIT-NEXT:    ld 6, 8(3)
+; LE-64BIT-NEXT:    ld 3, 0(3)
+; LE-64BIT-NEXT:    slwi 4, 4, 5
+; LE-64BIT-NEXT:    subfic 7, 4, 64
+; LE-64BIT-NEXT:    srd 3, 3, 4
+; LE-64BIT-NEXT:    sld 7, 6, 7
+; LE-64BIT-NEXT:    or 3, 3, 7
+; LE-64BIT-NEXT:    addi 7, 4, -64
+; LE-64BIT-NEXT:    srd 4, 6, 4
+; LE-64BIT-NEXT:    srd 7, 6, 7
+; LE-64BIT-NEXT:    std 4, 8(5)
+; LE-64BIT-NEXT:    or 3, 3, 7
+; LE-64BIT-NEXT:    std 3, 0(5)
+; LE-64BIT-NEXT:    blr
+;
+; BE-LABEL: lshr_16bytes_wordOff:
+; BE:       # %bb.0:
+; BE-NEXT:    lwz 4, 12(4)
+; BE-NEXT:    ld 6, 0(3)
+; BE-NEXT:    ld 3, 8(3)
+; BE-NEXT:    slwi 4, 4, 5
+; BE-NEXT:    subfic 7, 4, 64
+; BE-NEXT:    srd 3, 3, 4
+; BE-NEXT:    sld 7, 6, 7
+; BE-NEXT:    addi 8, 4, -64
+; BE-NEXT:    or 3, 3, 7
+; BE-NEXT:    srd 7, 6, 8
+; BE-NEXT:    srd 4, 6, 4
+; BE-NEXT:    or 3, 3, 7
+; BE-NEXT:    std 4, 0(5)
+; BE-NEXT:    std 3, 8(5)
+; BE-NEXT:    blr
+;
+; LE-32BIT-LABEL: lshr_16bytes_wordOff:
+; LE-32BIT:       # %bb.0:
+; LE-32BIT-NEXT:    stwu 1, -48(1)
+; LE-32BIT-NEXT:    lwz 7, 0(3)
+; LE-32BIT-NEXT:    li 6, 0
+; LE-32BIT-NEXT:    lwz 8, 4(3)
+; LE-32BIT-NEXT:    lwz 9, 8(3)
+; LE-32BIT-NEXT:    lwz 3, 12(3)
+; LE-32BIT-NEXT:    lwz 4, 12(4)
 ; LE-32BIT-NEXT:    stw 3, 44(1)
 ; LE-32BIT-NEXT:    addi 3, 1, 32
-; LE-32BIT-NEXT:    clrlwi 4, 4, 28
+; LE-32BIT-NEXT:    rlwinm 4, 4, 2, 28, 29
 ; LE-32BIT-NEXT:    stw 6, 28(1)
 ; LE-32BIT-NEXT:    sub 3, 3, 4
 ; LE-32BIT-NEXT:    stw 6, 24(1)
@@ -255,12 +342,13 @@ define void @lshr_16bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ; LE-32BIT-NEXT:    addi 1, 1, 48
 ; LE-32BIT-NEXT:    blr
   %src = load i128, ptr %src.ptr, align 1
-  %byteOff = load i128, ptr %byteOff.ptr, align 1
-  %bitOff = shl i128 %byteOff, 3
+  %wordOff = load i128, ptr %wordOff.ptr, align 1
+  %bitOff = shl i128 %wordOff, 5
   %res = lshr i128 %src, %bitOff
   store i128 %res, ptr %dst, align 1
   ret void
 }
+
 define void @shl_16bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ; LE-64BIT-LABEL: shl_16bytes:
 ; LE-64BIT:       # %bb.0:
@@ -309,7 +397,93 @@ define void @shl_16bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ; LE-32BIT-NEXT:    lwz 4, 12(4)
 ; LE-32BIT-NEXT:    stw 6, 44(1)
 ; LE-32BIT-NEXT:    stw 6, 40(1)
-; LE-32BIT-NEXT:    clrlwi 4, 4, 28
+; LE-32BIT-NEXT:    stw 6, 36(1)
+; LE-32BIT-NEXT:    stw 6, 32(1)
+; LE-32BIT-NEXT:    rlwinm 6, 4, 0, 28, 29
+; LE-32BIT-NEXT:    stw 3, 28(1)
+; LE-32BIT-NEXT:    addi 3, 1, 16
+; LE-32BIT-NEXT:    stw 9, 24(1)
+; LE-32BIT-NEXT:    rlwinm 4, 4, 3, 27, 28
+; LE-32BIT-NEXT:    stw 8, 20(1)
+; LE-32BIT-NEXT:    subfic 8, 4, 32
+; LE-32BIT-NEXT:    stw 7, 16(1)
+; LE-32BIT-NEXT:    lwzux 3, 6, 3
+; LE-32BIT-NEXT:    lwz 9, 4(6)
+; LE-32BIT-NEXT:    slw 3, 3, 4
+; LE-32BIT-NEXT:    lwz 7, 8(6)
+; LE-32BIT-NEXT:    lwz 6, 12(6)
+; LE-32BIT-NEXT:    slw 11, 9, 4
+; LE-32BIT-NEXT:    srw 9, 9, 8
+; LE-32BIT-NEXT:    srw 10, 7, 8
+; LE-32BIT-NEXT:    srw 8, 6, 8
+; LE-32BIT-NEXT:    slw 7, 7, 4
+; LE-32BIT-NEXT:    slw 4, 6, 4
+; LE-32BIT-NEXT:    or 3, 3, 9
+; LE-32BIT-NEXT:    stw 4, 12(5)
+; LE-32BIT-NEXT:    or 4, 7, 8
+; LE-32BIT-NEXT:    stw 3, 0(5)
+; LE-32BIT-NEXT:    or 3, 11, 10
+; LE-32BIT-NEXT:    stw 4, 8(5)
+; LE-32BIT-NEXT:    stw 3, 4(5)
+; LE-32BIT-NEXT:    addi 1, 1, 48
+; LE-32BIT-NEXT:    blr
+  %src = load i128, ptr %src.ptr, align 1
+  %byteOff = load i128, ptr %byteOff.ptr, align 1
+  %bitOff = shl i128 %byteOff, 3
+  %res = shl i128 %src, %bitOff
+  store i128 %res, ptr %dst, align 1
+  ret void
+}
+
+define void @shl_16bytes_wordOff(ptr %src.ptr, ptr %wordOff.ptr, ptr %dst) nounwind {
+; LE-64BIT-LABEL: shl_16bytes_wordOff:
+; LE-64BIT:       # %bb.0:
+; LE-64BIT-NEXT:    lwz 4, 0(4)
+; LE-64BIT-NEXT:    ld 6, 0(3)
+; LE-64BIT-NEXT:    ld 3, 8(3)
+; LE-64BIT-NEXT:    slwi 4, 4, 5
+; LE-64BIT-NEXT:    subfic 7, 4, 64
+; LE-64BIT-NEXT:    sld 3, 3, 4
+; LE-64BIT-NEXT:    srd 7, 6, 7
+; LE-64BIT-NEXT:    or 3, 3, 7
+; LE-64BIT-NEXT:    addi 7, 4, -64
+; LE-64BIT-NEXT:    sld 4, 6, 4
+; LE-64BIT-NEXT:    sld 7, 6, 7
+; LE-64BIT-NEXT:    std 4, 0(5)
+; LE-64BIT-NEXT:    or 3, 3, 7
+; LE-64BIT-NEXT:    std 3, 8(5)
+; LE-64BIT-NEXT:    blr
+;
+; BE-LABEL: shl_16bytes_wordOff:
+; BE:       # %bb.0:
+; BE-NEXT:    lwz 4, 12(4)
+; BE-NEXT:    ld 6, 8(3)
+; BE-NEXT:    ld 3, 0(3)
+; BE-NEXT:    slwi 4, 4, 5
+; BE-NEXT:    subfic 7, 4, 64
+; BE-NEXT:    sld 3, 3, 4
+; BE-NEXT:    srd 7, 6, 7
+; BE-NEXT:    addi 8, 4, -64
+; BE-NEXT:    or 3, 3, 7
+; BE-NEXT:    sld 7, 6, 8
+; BE-NEXT:    sld 4, 6, 4
+; BE-NEXT:    or 3, 3, 7
+; BE-NEXT:    std 4, 8(5)
+; BE-NEXT:    std 3, 0(5)
+; BE-NEXT:    blr
+;
+; LE-32BIT-LABEL: shl_16bytes_wordOff:
+; LE-32BIT:       # %bb.0:
+; LE-32BIT-NEXT:    stwu 1, -48(1)
+; LE-32BIT-NEXT:    lwz 7, 0(3)
+; LE-32BIT-NEXT:    li 6, 0
+; LE-32BIT-NEXT:    lwz 8, 4(3)
+; LE-32BIT-NEXT:    lwz 9, 8(3)
+; LE-32BIT-NEXT:    lwz 3, 12(3)
+; LE-32BIT-NEXT:    lwz 4, 12(4)
+; LE-32BIT-NEXT:    stw 6, 44(1)
+; LE-32BIT-NEXT:    stw 6, 40(1)
+; LE-32BIT-NEXT:    rlwinm 4, 4, 2, 28, 29
 ; LE-32BIT-NEXT:    stw 6, 36(1)
 ; LE-32BIT-NEXT:    stw 6, 32(1)
 ; LE-32BIT-NEXT:    stw 3, 28(1)
@@ -328,12 +502,13 @@ define void @shl_16bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ; LE-32BIT-NEXT:    addi 1, 1, 48
 ; LE-32BIT-NEXT:    blr
   %src = load i128, ptr %src.ptr, align 1
-  %byteOff = load i128, ptr %byteOff.ptr, align 1
-  %bitOff = shl i128 %byteOff, 3
+  %wordOff = load i128, ptr %wordOff.ptr, align 1
+  %bitOff = shl i128 %wordOff, 5
   %res = shl i128 %src, %bitOff
   store i128 %res, ptr %dst, align 1
   ret void
 }
+
 define void @ashr_16bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ; LE-64BIT-LABEL: ashr_16bytes:
 ; LE-64BIT:       # %bb.0:
@@ -361,17 +536,17 @@ define void @ashr_16bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ; BE-NEXT:    slwi 4, 4, 3
 ; BE-NEXT:    addi 7, 4, -64
 ; BE-NEXT:    cmpwi 7, 1
-; BE-NEXT:    blt 0, .LBB8_2
+; BE-NEXT:    blt 0, .LBB10_2
 ; BE-NEXT:  # %bb.1:
 ; BE-NEXT:    srad 3, 6, 7
-; BE-NEXT:    b .LBB8_3
-; BE-NEXT:  .LBB8_2:
+; BE-NEXT:    b .LBB10_3
+; BE-NEXT:  .LBB10_2:
 ; BE-NEXT:    ld 3, 8(3)
 ; BE-NEXT:    subfic 7, 4, 64
 ; BE-NEXT:    sld 7, 6, 7
 ; BE-NEXT:    srd 3, 3, 4
 ; BE-NEXT:    or 3, 3, 7
-; BE-NEXT:  .LBB8_3:
+; BE-NEXT:  .LBB10_3:
 ; BE-NEXT:    srad 4, 6, 4
 ; BE-NEXT:    std 3, 8(5)
 ; BE-NEXT:    std 4, 0(5)
@@ -388,7 +563,100 @@ define void @ashr_16bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ; LE-32BIT-NEXT:    lwz 4, 12(4)
 ; LE-32BIT-NEXT:    stw 3, 44(1)
 ; LE-32BIT-NEXT:    srawi 3, 7, 31
-; LE-32BIT-NEXT:    clrlwi 4, 4, 28
+; LE-32BIT-NEXT:    stw 7, 32(1)
+; LE-32BIT-NEXT:    rlwinm 7, 4, 0, 28, 29
+; LE-32BIT-NEXT:    stw 9, 40(1)
+; LE-32BIT-NEXT:    rlwinm 4, 4, 3, 27, 28
+; LE-32BIT-NEXT:    stw 8, 36(1)
+; LE-32BIT-NEXT:    subfic 9, 4, 32
+; LE-32BIT-NEXT:    stw 3, 28(1)
+; LE-32BIT-NEXT:    stw 3, 24(1)
+; LE-32BIT-NEXT:    stw 3, 20(1)
+; LE-32BIT-NEXT:    stw 3, 16(1)
+; LE-32BIT-NEXT:    sub 3, 6, 7
+; LE-32BIT-NEXT:    lwz 6, 4(3)
+; LE-32BIT-NEXT:    lwz 7, 0(3)
+; LE-32BIT-NEXT:    lwz 8, 12(3)
+; LE-32BIT-NEXT:    srw 10, 6, 4
+; LE-32BIT-NEXT:    lwz 3, 8(3)
+; LE-32BIT-NEXT:    slw 11, 7, 9
+; LE-32BIT-NEXT:    slw 6, 6, 9
+; LE-32BIT-NEXT:    srw 8, 8, 4
+; LE-32BIT-NEXT:    slw 9, 3, 9
+; LE-32BIT-NEXT:    srw 3, 3, 4
+; LE-32BIT-NEXT:    or 3, 6, 3
+; LE-32BIT-NEXT:    stw 3, 8(5)
+; LE-32BIT-NEXT:    or 3, 9, 8
+; LE-32BIT-NEXT:    sraw 4, 7, 4
+; LE-32BIT-NEXT:    stw 3, 12(5)
+; LE-32BIT-NEXT:    or 3, 11, 10
+; LE-32BIT-NEXT:    stw 4, 0(5)
+; LE-32BIT-NEXT:    stw 3, 4(5)
+; LE-32BIT-NEXT:    addi 1, 1, 48
+; LE-32BIT-NEXT:    blr
+  %src = load i128, ptr %src.ptr, align 1
+  %byteOff = load i128, ptr %byteOff.ptr, align 1
+  %bitOff = shl i128 %byteOff, 3
+  %res = ashr i128 %src, %bitOff
+  store i128 %res, ptr %dst, align 1
+  ret void
+}
+
+define void @ashr_16bytes_wordOff(ptr %src.ptr, ptr %wordOff.ptr, ptr %dst) nounwind {
+; LE-64BIT-LABEL: ashr_16bytes_wordOff:
+; LE-64BIT:       # %bb.0:
+; LE-64BIT-NEXT:    lwz 4, 0(4)
+; LE-64BIT-NEXT:    ld 6, 8(3)
+; LE-64BIT-NEXT:    ld 3, 0(3)
+; LE-64BIT-NEXT:    slwi 4, 4, 5
+; LE-64BIT-NEXT:    subfic 7, 4, 64
+; LE-64BIT-NEXT:    srd 3, 3, 4
+; LE-64BIT-NEXT:    sld 7, 6, 7
+; LE-64BIT-NEXT:    or 3, 3, 7
+; LE-64BIT-NEXT:    addi 7, 4, -64
+; LE-64BIT-NEXT:    srad 4, 6, 4
+; LE-64BIT-NEXT:    cmpwi 7, 1
+; LE-64BIT-NEXT:    srad 8, 6, 7
+; LE-64BIT-NEXT:    std 4, 8(5)
+; LE-64BIT-NEXT:    isellt 3, 3, 8
+; LE-64BIT-NEXT:    std 3, 0(5)
+; LE-64BIT-NEXT:    blr
+;
+; BE-LABEL: ashr_16bytes_wordOff:
+; BE:       # %bb.0:
+; BE-NEXT:    lwz 4, 12(4)
+; BE-NEXT:    ld 6, 0(3)
+; BE-NEXT:    slwi 4, 4, 5
+; BE-NEXT:    addi 7, 4, -64
+; BE-NEXT:    cmpwi 7, 1
+; BE-NEXT:    blt 0, .LBB11_2
+; BE-NEXT:  # %bb.1:
+; BE-NEXT:    srad 3, 6, 7
+; BE-NEXT:    b .LBB11_3
+; BE-NEXT:  .LBB11_2:
+; BE-NEXT:    ld 3, 8(3)
+; BE-NEXT:    subfic 7, 4, 64
+; BE-NEXT:    sld 7, 6, 7
+; BE-NEXT:    srd 3, 3, 4
+; BE-NEXT:    or 3, 3, 7
+; BE-NEXT:  .LBB11_3:
+; BE-NEXT:    srad 4, 6, 4
+; BE-NEXT:    std 3, 8(5)
+; BE-NEXT:    std 4, 0(5)
+; BE-NEXT:    blr
+;
+; LE-32BIT-LABEL: ashr_16bytes_wordOff:
+; LE-32BIT:       # %bb.0:
+; LE-32BIT-NEXT:    stwu 1, -48(1)
+; LE-32BIT-NEXT:    lwz 7, 0(3)
+; LE-32BIT-NEXT:    addi 6, 1, 32
+; LE-32BIT-NEXT:    lwz 8, 4(3)
+; LE-32BIT-NEXT:    lwz 9, 8(3)
+; LE-32BIT-NEXT:    lwz 3, 12(3)
+; LE-32BIT-NEXT:    lwz 4, 12(4)
+; LE-32BIT-NEXT:    stw 3, 44(1)
+; LE-32BIT-NEXT:    srawi 3, 7, 31
+; LE-32BIT-NEXT:    rlwinm 4, 4, 2, 28, 29
 ; LE-32BIT-NEXT:    stw 9, 40(1)
 ; LE-32BIT-NEXT:    stw 8, 36(1)
 ; LE-32BIT-NEXT:    stw 7, 32(1)
@@ -408,8 +676,8 @@ define void @ashr_16bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ; LE-32BIT-NEXT:    addi 1, 1, 48
 ; LE-32BIT-NEXT:    blr
   %src = load i128, ptr %src.ptr, align 1
-  %byteOff = load i128, ptr %byteOff.ptr, align 1
-  %bitOff = shl i128 %byteOff, 3
+  %wordOff = load i128, ptr %wordOff.ptr, align 1
+  %bitOff = shl i128 %wordOff, 5
   %res = ashr i128 %src, %bitOff
   store i128 %res, ptr %dst, align 1
   ret void
@@ -422,20 +690,35 @@ define void @lshr_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ; LE-64BIT-NEXT:    lxvd2x 1, 0, 3
 ; LE-64BIT-NEXT:    xxlxor 2, 2, 2
 ; LE-64BIT-NEXT:    addi 7, 1, -64
+; LE-64BIT-NEXT:    li 8, 32
 ; LE-64BIT-NEXT:    lxvd2x 0, 3, 6
 ; LE-64BIT-NEXT:    lwz 3, 0(4)
 ; LE-64BIT-NEXT:    li 4, 48
 ; LE-64BIT-NEXT:    stxvd2x 2, 7, 4
-; LE-64BIT-NEXT:    li 4, 32
-; LE-64BIT-NEXT:    clrldi 3, 3, 59
-; LE-64BIT-NEXT:    stxvd2x 2, 7, 4
+; LE-64BIT-NEXT:    stxvd2x 2, 7, 8
+; LE-64BIT-NEXT:    rlwinm 4, 3, 0, 27, 28
+; LE-64BIT-NEXT:    rlwinm 3, 3, 3, 26, 28
 ; LE-64BIT-NEXT:    stxvd2x 0, 7, 6
 ; LE-64BIT-NEXT:    stxvd2x 1, 0, 7
-; LE-64BIT-NEXT:    lxvd2x 0, 7, 3
-; LE-64BIT-NEXT:    add 3, 7, 3
-; LE-64BIT-NEXT:    lxvd2x 1, 3, 6
-; LE-64BIT-NEXT:    stxvd2x 1, 5, 6
-; LE-64BIT-NEXT:    stxvd2x 0, 0, 5
+; LE-64BIT-NEXT:    ldux 6, 4, 7
+; LE-64BIT-NEXT:    subfic 7, 3, 64
+; LE-64BIT-NEXT:    ld 8, 8(4)
+; LE-64BIT-NEXT:    ld 9, 16(4)
+; LE-64BIT-NEXT:    ld 4, 24(4)
+; LE-64BIT-NEXT:    srd 6, 6, 3
+; LE-64BIT-NEXT:    sld 10, 8, 7
+; LE-64BIT-NEXT:    sld 11, 4, 7
+; LE-64BIT-NEXT:    srd 8, 8, 3
+; LE-64BIT-NEXT:    sld 7, 9, 7
+; LE-64BIT-NEXT:    or 6, 10, 6
+; LE-64BIT-NEXT:    srd 10, 9, 3
+; LE-64BIT-NEXT:    srd 3, 4, 3
+; LE-64BIT-NEXT:    or 7, 7, 8
+; LE-64BIT-NEXT:    std 3, 24(5)
+; LE-64BIT-NEXT:    or 3, 11, 10
+; LE-64BIT-NEXT:    std 7, 8(5)
+; LE-64BIT-NEXT:    std 6, 0(5)
+; LE-64BIT-NEXT:    std 3, 16(5)
 ; LE-64BIT-NEXT:    blr
 ;
 ; BE-LABEL: lshr_32bytes:
@@ -445,33 +728,44 @@ define void @lshr_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ; BE-NEXT:    ld 8, 16(3)
 ; BE-NEXT:    ld 3, 24(3)
 ; BE-NEXT:    lwz 4, 28(4)
-; BE-NEXT:    addi 9, 1, -64
-; BE-NEXT:    li 10, 0
-; BE-NEXT:    std 10, 24(9)
-; BE-NEXT:    std 10, 16(9)
-; BE-NEXT:    std 10, 8(9)
-; BE-NEXT:    std 10, -64(1)
-; BE-NEXT:    std 3, 56(9)
-; BE-NEXT:    clrlwi 3, 4, 27
+; BE-NEXT:    li 9, 0
+; BE-NEXT:    addi 10, 1, -32
+; BE-NEXT:    std 9, -40(1)
+; BE-NEXT:    std 9, -48(1)
+; BE-NEXT:    std 9, -56(1)
+; BE-NEXT:    std 9, -64(1)
+; BE-NEXT:    std 3, -8(1)
+; BE-NEXT:    rlwinm 3, 4, 0, 27, 28
 ; BE-NEXT:    neg 3, 3
-; BE-NEXT:    std 8, 48(9)
-; BE-NEXT:    std 7, 40(9)
-; BE-NEXT:    std 6, 32(9)
+; BE-NEXT:    std 8, -16(1)
+; BE-NEXT:    std 7, -24(1)
+; BE-NEXT:    std 6, -32(1)
 ; BE-NEXT:    extsw 3, 3
-; BE-NEXT:    addi 4, 1, -32
-; BE-NEXT:    ldux 3, 4, 3
-; BE-NEXT:    ld 6, 8(4)
-; BE-NEXT:    ld 7, 24(4)
-; BE-NEXT:    ld 4, 16(4)
+; BE-NEXT:    ldux 3, 10, 3
+; BE-NEXT:    rlwinm 4, 4, 3, 26, 28
+; BE-NEXT:    subfic 9, 4, 64
+; BE-NEXT:    ld 6, 8(10)
+; BE-NEXT:    ld 7, 24(10)
+; BE-NEXT:    ld 8, 16(10)
+; BE-NEXT:    sld 10, 3, 9
+; BE-NEXT:    srd 3, 3, 4
 ; BE-NEXT:    std 3, 0(5)
-; BE-NEXT:    std 4, 16(5)
+; BE-NEXT:    srd 11, 6, 4
+; BE-NEXT:    srd 7, 7, 4
+; BE-NEXT:    sld 6, 6, 9
+; BE-NEXT:    sld 9, 8, 9
+; BE-NEXT:    srd 8, 8, 4
+; BE-NEXT:    or 10, 10, 11
+; BE-NEXT:    or 7, 9, 7
+; BE-NEXT:    or 6, 6, 8
+; BE-NEXT:    std 6, 16(5)
 ; BE-NEXT:    std 7, 24(5)
-; BE-NEXT:    std 6, 8(5)
+; BE-NEXT:    std 10, 8(5)
 ; BE-NEXT:    blr
 ;
 ; LE-32BIT-LABEL: lshr_32bytes:
 ; LE-32BIT:       # %bb.0:
-; LE-32BIT-NEXT:    stwu 1, -80(1)
+; LE-32BIT-NEXT:    stwu 1, -112(1)
 ; LE-32BIT-NEXT:    lwz 7, 0(3)
 ; LE-32BIT-NEXT:    li 6, 0
 ; LE-32BIT-NEXT:    lwz 8, 4(3)
@@ -482,11 +776,7 @@ define void @lshr_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ; LE-32BIT-NEXT:    lwz 0, 24(3)
 ; LE-32BIT-NEXT:    lwz 3, 28(3)
 ; LE-32BIT-NEXT:    lwz 4, 28(4)
-; LE-32BIT-NEXT:    stw 3, 76(1)
-; LE-32BIT-NEXT:    addi 3, 1, 48
-; LE-32BIT-NEXT:    clrlwi 4, 4, 27
 ; LE-32BIT-NEXT:    stw 6, 44(1)
-; LE-32BIT-NEXT:    sub 3, 3, 4
 ; LE-32BIT-NEXT:    stw 6, 40(1)
 ; LE-32BIT-NEXT:    stw 6, 36(1)
 ; LE-32BIT-NEXT:    stw 6, 32(1)
@@ -494,30 +784,70 @@ define void @lshr_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ; LE-32BIT-NEXT:    stw 6, 24(1)
 ; LE-32BIT-NEXT:    stw 6, 20(1)
 ; LE-32BIT-NEXT:    stw 6, 16(1)
+; LE-32BIT-NEXT:    rlwinm 6, 4, 0, 27, 29
+; LE-32BIT-NEXT:    stw 3, 76(1)
+; LE-32BIT-NEXT:    addi 3, 1, 48
+; LE-32BIT-NEXT:    stw 25, 84(1) # 4-byte Folded Spill
+; LE-32BIT-NEXT:    sub 3, 3, 6
+; LE-32BIT-NEXT:    stw 26, 88(1) # 4-byte Folded Spill
+; LE-32BIT-NEXT:    rlwinm 4, 4, 3, 27, 28
+; LE-32BIT-NEXT:    stw 27, 92(1) # 4-byte Folded Spill
+; LE-32BIT-NEXT:    stw 28, 96(1) # 4-byte Folded Spill
+; LE-32BIT-NEXT:    stw 29, 100(1) # 4-byte Folded Spill
+; LE-32BIT-NEXT:    stw 30, 104(1) # 4-byte Folded Spill
 ; LE-32BIT-NEXT:    stw 0, 72(1)
+; LE-32BIT-NEXT:    subfic 0, 4, 32
 ; LE-32BIT-NEXT:    stw 12, 68(1)
 ; LE-32BIT-NEXT:    stw 11, 64(1)
 ; LE-32BIT-NEXT:    stw 10, 60(1)
 ; LE-32BIT-NEXT:    stw 9, 56(1)
 ; LE-32BIT-NEXT:    stw 8, 52(1)
 ; LE-32BIT-NEXT:    stw 7, 48(1)
-; LE-32BIT-NEXT:    lwz 4, 4(3)
-; LE-32BIT-NEXT:    lwz 6, 0(3)
-; LE-32BIT-NEXT:    lwz 7, 12(3)
-; LE-32BIT-NEXT:    lwz 8, 8(3)
-; LE-32BIT-NEXT:    lwz 9, 20(3)
-; LE-32BIT-NEXT:    lwz 10, 16(3)
-; LE-32BIT-NEXT:    lwz 11, 24(3)
-; LE-32BIT-NEXT:    lwz 3, 28(3)
-; LE-32BIT-NEXT:    stw 11, 24(5)
+; LE-32BIT-NEXT:    lwz 6, 4(3)
+; LE-32BIT-NEXT:    lwz 7, 0(3)
+; LE-32BIT-NEXT:    lwz 8, 12(3)
+; LE-32BIT-NEXT:    srw 30, 6, 4
+; LE-32BIT-NEXT:    lwz 9, 8(3)
+; LE-32BIT-NEXT:    slw 29, 7, 0
+; LE-32BIT-NEXT:    lwz 10, 20(3)
+; LE-32BIT-NEXT:    srw 28, 8, 4
+; LE-32BIT-NEXT:    lwz 11, 16(3)
+; LE-32BIT-NEXT:    slw 27, 9, 0
+; LE-32BIT-NEXT:    lwz 12, 28(3)
+; LE-32BIT-NEXT:    slw 6, 6, 0
+; LE-32BIT-NEXT:    lwz 3, 24(3)
+; LE-32BIT-NEXT:    srw 26, 10, 4
+; LE-32BIT-NEXT:    slw 25, 11, 0
+; LE-32BIT-NEXT:    slw 8, 8, 0
+; LE-32BIT-NEXT:    slw 10, 10, 0
+; LE-32BIT-NEXT:    slw 0, 3, 0
+; LE-32BIT-NEXT:    srw 3, 3, 4
+; LE-32BIT-NEXT:    srw 12, 12, 4
+; LE-32BIT-NEXT:    or 3, 10, 3
+; LE-32BIT-NEXT:    srw 11, 11, 4
+; LE-32BIT-NEXT:    stw 3, 24(5)
+; LE-32BIT-NEXT:    or 3, 0, 12
 ; LE-32BIT-NEXT:    stw 3, 28(5)
-; LE-32BIT-NEXT:    stw 10, 16(5)
-; LE-32BIT-NEXT:    stw 9, 20(5)
-; LE-32BIT-NEXT:    stw 8, 8(5)
-; LE-32BIT-NEXT:    stw 7, 12(5)
-; LE-32BIT-NEXT:    stw 6, 0(5)
-; LE-32BIT-NEXT:    stw 4, 4(5)
-; LE-32BIT-NEXT:    addi 1, 1, 80
+; LE-32BIT-NEXT:    or 3, 8, 11
+; LE-32BIT-NEXT:    srw 9, 9, 4
+; LE-32BIT-NEXT:    stw 3, 16(5)
+; LE-32BIT-NEXT:    or 3, 25, 26
+; LE-32BIT-NEXT:    stw 3, 20(5)
+; LE-32BIT-NEXT:    or 3, 6, 9
+; LE-32BIT-NEXT:    stw 3, 8(5)
+; LE-32BIT-NEXT:    or 3, 27, 28
+; LE-32BIT-NEXT:    srw 4, 7, 4
+; LE-32BIT-NEXT:    stw 3, 12(5)
+; LE-32BIT-NEXT:    or 3, 29, 30
+; LE-32BIT-NEXT:    stw 4, 0(5)
+; LE-32BIT-NEXT:    stw 3, 4(5)
+; LE-32BIT-NEXT:    lwz 30, 104(1) # 4-byte Folded Reload
+; LE-32BIT-NEXT:    lwz 29, 100(1) # 4-byte Folded Reload
+; LE-32BIT-NEXT:    lwz 28, 96(1) # 4-byte Folded Reload
+; LE-32BIT-NEXT:    lwz 27, 92(1) # 4-byte Folded Reload
+; LE-32BIT-NEXT:    lwz 26, 88(1) # 4-byte Folded Reload
+; LE-32BIT-NEXT:    lwz 25, 84(1) # 4-byte Folded Reload
+; LE-32BIT-NEXT:    addi 1, 1, 112
 ; LE-32BIT-NEXT:    blr
   %src = load i256, ptr %src.ptr, align 1
   %byteOff = load i256, ptr %byteOff.ptr, align 1
@@ -526,32 +856,297 @@ define void @lshr_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
   store i256 %res, ptr %dst, align 1
   ret void
 }
-define void @shl_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
-; LE-64BIT-LABEL: shl_32bytes:
+
+define void @lshr_32bytes_wordOff(ptr %src.ptr, ptr %wordOff.ptr, ptr %dst) nounwind {
+; LE-64BIT-LABEL: lshr_32bytes_wordOff:
 ; LE-64BIT:       # %bb.0:
 ; LE-64BIT-NEXT:    li 6, 16
 ; LE-64BIT-NEXT:    lxvd2x 1, 0, 3
 ; LE-64BIT-NEXT:    xxlxor 2, 2, 2
-; LE-64BIT-NEXT:    li 7, 48
+; LE-64BIT-NEXT:    addi 7, 1, -64
+; LE-64BIT-NEXT:    li 8, 32
 ; LE-64BIT-NEXT:    lxvd2x 0, 3, 6
 ; LE-64BIT-NEXT:    lwz 3, 0(4)
-; LE-64BIT-NEXT:    addi 4, 1, -64
-; LE-64BIT-NEXT:    stxvd2x 2, 4, 6
-; LE-64BIT-NEXT:    clrlwi 3, 3, 27
-; LE-64BIT-NEXT:    stxvd2x 0, 4, 7
-; LE-64BIT-NEXT:    li 7, 32
-; LE-64BIT-NEXT:    neg 3, 3
-; LE-64BIT-NEXT:    stxvd2x 1, 4, 7
-; LE-64BIT-NEXT:    stxvd2x 2, 0, 4
-; LE-64BIT-NEXT:    extsw 3, 3
-; LE-64BIT-NEXT:    addi 4, 1, -32
-; LE-64BIT-NEXT:    lxvd2x 0, 4, 3
-; LE-64BIT-NEXT:    add 3, 4, 3
+; LE-64BIT-NEXT:    li 4, 48
+; LE-64BIT-NEXT:    stxvd2x 2, 7, 4
+; LE-64BIT-NEXT:    stxvd2x 2, 7, 8
+; LE-64BIT-NEXT:    rlwinm 4, 3, 2, 27, 28
+; LE-64BIT-NEXT:    rlwinm 3, 3, 5, 26, 26
+; LE-64BIT-NEXT:    stxvd2x 0, 7, 6
+; LE-64BIT-NEXT:    stxvd2x 1, 0, 7
+; LE-64BIT-NEXT:    ldux 6, 4, 7
+; LE-64BIT-NEXT:    subfic 7, 3, 64
+; LE-64BIT-NEXT:    ld 8, 8(4)
+; LE-64BIT-NEXT:    ld 9, 16(4)
+; LE-64BIT-NEXT:    ld 4, 24(4)
+; LE-64BIT-NEXT:    srd 6, 6, 3
+; LE-64BIT-NEXT:    sld 10, 8, 7
+; LE-64BIT-NEXT:    sld 11, 4, 7
+; LE-64BIT-NEXT:    srd 8, 8, 3
+; LE-64BIT-NEXT:    sld 7, 9, 7
+; LE-64BIT-NEXT:    or 6, 10, 6
+; LE-64BIT-NEXT:    srd 10, 9, 3
+; LE-64BIT-NEXT:    srd 3, 4, 3
+; LE-64BIT-NEXT:    or 7, 7, 8
+; LE-64BIT-NEXT:    std 3, 24(5)
+; LE-64BIT-NEXT:    or 3, 11, 10
+; LE-64BIT-NEXT:    std 7, 8(5)
+; LE-64BIT-NEXT:    std 6, 0(5)
+; LE-64BIT-NEXT:    std 3, 16(5)
+; LE-64BIT-NEXT:    blr
+;
+; BE-LABEL: lshr_32bytes_wordOff:
+; BE:       # %bb.0:
+; BE-NEXT:    ld 6, 0(3)
+; BE-NEXT:    ld 7, 8(3)
+; BE-NEXT:    ld 8, 16(3)
+; BE-NEXT:    ld 3, 24(3)
+; BE-NEXT:    lwz 4, 28(4)
+; BE-NEXT:    li 9, 0
+; BE-NEXT:    addi 10, 1, -32
+; BE-NEXT:    std 9, -40(1)
+; BE-NEXT:    std 9, -48(1)
+; BE-NEXT:    std 9, -56(1)
+; BE-NEXT:    std 9, -64(1)
+; BE-NEXT:    std 3, -8(1)
+; BE-NEXT:    rlwinm 3, 4, 2, 27, 28
+; BE-NEXT:    neg 3, 3
+; BE-NEXT:    std 8, -16(1)
+; BE-NEXT:    std 7, -24(1)
+; BE-NEXT:    std 6, -32(1)
+; BE-NEXT:    extsw 3, 3
+; BE-NEXT:    ldux 3, 10, 3
+; BE-NEXT:    rlwinm 4, 4, 5, 26, 26
+; BE-NEXT:    subfic 9, 4, 64
+; BE-NEXT:    ld 6, 8(10)
+; BE-NEXT:    ld 7, 24(10)
+; BE-NEXT:    ld 8, 16(10)
+; BE-NEXT:    sld 10, 3, 9
+; BE-NEXT:    srd 3, 3, 4
+; BE-NEXT:    std 3, 0(5)
+; BE-NEXT:    srd 11, 6, 4
+; BE-NEXT:    srd 7, 7, 4
+; BE-NEXT:    sld 6, 6, 9
+; BE-NEXT:    sld 9, 8, 9
+; BE-NEXT:    srd 8, 8, 4
+; BE-NEXT:    or 10, 10, 11
+; BE-NEXT:    or 7, 9, 7
+; BE-NEXT:    or 6, 6, 8
+; BE-NEXT:    std 6, 16(5)
+; BE-NEXT:    std 7, 24(5)
+; BE-NEXT:    std 10, 8(5)
+; BE-NEXT:    blr
+;
+; LE-32BIT-LABEL: lshr_32bytes_wordOff:
+; LE-32BIT:       # %bb.0:
+; LE-32BIT-NEXT:    stwu 1, -80(1)
+; LE-32BIT-NEXT:    lwz 7, 0(3)
+; LE-32BIT-NEXT:    li 6, 0
+; LE-32BIT-NEXT:    lwz 8, 4(3)
+; LE-32BIT-NEXT:    lwz 9, 8(3)
+; LE-32BIT-NEXT:    lwz 10, 12(3)
+; LE-32BIT-NEXT:    lwz 11, 16(3)
+; LE-32BIT-NEXT:    lwz 12, 20(3)
+; LE-32BIT-NEXT:    lwz 0, 24(3)
+; LE-32BIT-NEXT:    lwz 3, 28(3)
+; LE-32BIT-NEXT:    lwz 4, 28(4)
+; LE-32BIT-NEXT:    stw 3, 76(1)
+; LE-32BIT-NEXT:    addi 3, 1, 48
+; LE-32BIT-NEXT:    rlwinm 4, 4, 2, 27, 29
+; LE-32BIT-NEXT:    stw 6, 44(1)
+; LE-32BIT-NEXT:    sub 3, 3, 4
+; LE-32BIT-NEXT:    stw 6, 40(1)
+; LE-32BIT-NEXT:    stw 6, 36(1)
+; LE-32BIT-NEXT:    stw 6, 32(1)
+; LE-32BIT-NEXT:    stw 6, 28(1)
+; LE-32BIT-NEXT:    stw 6, 24(1)
+; LE-32BIT-NEXT:    stw 6, 20(1)
+; LE-32BIT-NEXT:    stw 6, 16(1)
+; LE-32BIT-NEXT:    stw 0, 72(1)
+; LE-32BIT-NEXT:    stw 12, 68(1)
+; LE-32BIT-NEXT:    stw 11, 64(1)
+; LE-32BIT-NEXT:    stw 10, 60(1)
+; LE-32BIT-NEXT:    stw 9, 56(1)
+; LE-32BIT-NEXT:    stw 8, 52(1)
+; LE-32BIT-NEXT:    stw 7, 48(1)
+; LE-32BIT-NEXT:    lwz 4, 4(3)
+; LE-32BIT-NEXT:    lwz 6, 0(3)
+; LE-32BIT-NEXT:    lwz 7, 12(3)
+; LE-32BIT-NEXT:    lwz 8, 8(3)
+; LE-32BIT-NEXT:    lwz 9, 20(3)
+; LE-32BIT-NEXT:    lwz 10, 16(3)
+; LE-32BIT-NEXT:    lwz 11, 24(3)
+; LE-32BIT-NEXT:    lwz 3, 28(3)
+; LE-32BIT-NEXT:    stw 11, 24(5)
+; LE-32BIT-NEXT:    stw 3, 28(5)
+; LE-32BIT-NEXT:    stw 10, 16(5)
+; LE-32BIT-NEXT:    stw 9, 20(5)
+; LE-32BIT-NEXT:    stw 8, 8(5)
+; LE-32BIT-NEXT:    stw 7, 12(5)
+; LE-32BIT-NEXT:    stw 6, 0(5)
+; LE-32BIT-NEXT:    stw 4, 4(5)
+; LE-32BIT-NEXT:    addi 1, 1, 80
+; LE-32BIT-NEXT:    blr
+  %src = load i256, ptr %src.ptr, align 1
+  %wordOff = load i256, ptr %wordOff.ptr, align 1
+  %bitOff = shl i256 %wordOff, 5
+  %res = lshr i256 %src, %bitOff
+  store i256 %res, ptr %dst, align 1
+  ret void
+}
+
+define void @lshr_32bytes_dwordOff(ptr %src.ptr, ptr %dwordOff.ptr, ptr %dst) nounwind {
+; LE-64BIT-LABEL: lshr_32bytes_dwordOff:
+; LE-64BIT:       # %bb.0:
+; LE-64BIT-NEXT:    li 6, 16
+; LE-64BIT-NEXT:    lxvd2x 1, 0, 3
+; LE-64BIT-NEXT:    xxlxor 2, 2, 2
+; LE-64BIT-NEXT:    addi 7, 1, -64
+; LE-64BIT-NEXT:    lxvd2x 0, 3, 6
+; LE-64BIT-NEXT:    lwz 3, 0(4)
+; LE-64BIT-NEXT:    li 4, 48
+; LE-64BIT-NEXT:    stxvd2x 2, 7, 4
+; LE-64BIT-NEXT:    li 4, 32
+; LE-64BIT-NEXT:    rlwinm 3, 3, 3, 27, 28
+; LE-64BIT-NEXT:    stxvd2x 2, 7, 4
+; LE-64BIT-NEXT:    stxvd2x 0, 7, 6
+; LE-64BIT-NEXT:    stxvd2x 1, 0, 7
+; LE-64BIT-NEXT:    lxvd2x 0, 7, 3
+; LE-64BIT-NEXT:    add 3, 7, 3
 ; LE-64BIT-NEXT:    lxvd2x 1, 3, 6
 ; LE-64BIT-NEXT:    stxvd2x 1, 5, 6
 ; LE-64BIT-NEXT:    stxvd2x 0, 0, 5
 ; LE-64BIT-NEXT:    blr
 ;
+; BE-LABEL: lshr_32bytes_dwordOff:
+; BE:       # %bb.0:
+; BE-NEXT:    ld 7, 0(3)
+; BE-NEXT:    ld 8, 8(3)
+; BE-NEXT:    ld 9, 16(3)
+; BE-NEXT:    ld 3, 24(3)
+; BE-NEXT:    lwz 4, 28(4)
+; BE-NEXT:    li 6, 0
+; BE-NEXT:    std 6, -40(1)
+; BE-NEXT:    std 6, -48(1)
+; BE-NEXT:    std 6, -56(1)
+; BE-NEXT:    std 6, -64(1)
+; BE-NEXT:    std 3, -8(1)
+; BE-NEXT:    rlwinm 3, 4, 3, 27, 28
+; BE-NEXT:    neg 3, 3
+; BE-NEXT:    std 9, -16(1)
+; BE-NEXT:    std 8, -24(1)
+; BE-NEXT:    std 7, -32(1)
+; BE-NEXT:    extsw 3, 3
+; BE-NEXT:    addi 4, 1, -32
+; BE-NEXT:    ldux 3, 4, 3
+; BE-NEXT:    ld 6, 8(4)
+; BE-NEXT:    ld 7, 24(4)
+; BE-NEXT:    ld 4, 16(4)
+; BE-NEXT:    std 3, 0(5)
+; BE-NEXT:    std 4, 16(5)
+; BE-NEXT:    std 7, 24(5)
+; BE-NEXT:    std 6, 8(5)
+; BE-NEXT:    blr
+;
+; LE-32BIT-LABEL: lshr_32bytes_dwordOff:
+; LE-32BIT:       # %bb.0:
+; LE-32BIT-NEXT:    stwu 1, -80(1)
+; LE-32BIT-NEXT:    lwz 7, 0(3)
+; LE-32BIT-NEXT:    li 6, 0
+; LE-32BIT-NEXT:    lwz 8, 4(3)
+; LE-32BIT-NEXT:    lwz 9, 8(3)
+; LE-32BIT-NEXT:    lwz 10, 12(3)
+; LE-32BIT-NEXT:    lwz 11, 16(3)
+; LE-32BIT-NEXT:    lwz 12, 20(3)
+; LE-32BIT-NEXT:    lwz 0, 24(3)
+; LE-32BIT-NEXT:    lwz 3, 28(3)
+; LE-32BIT-NEXT:    lwz 4, 28(4)
+; LE-32BIT-NEXT:    stw 3, 76(1)
+; LE-32BIT-NEXT:    addi 3, 1, 48
+; LE-32BIT-NEXT:    rlwinm 4, 4, 3, 27, 28
+; LE-32BIT-NEXT:    stw 6, 44(1)
+; LE-32BIT-NEXT:    sub 3, 3, 4
+; LE-32BIT-NEXT:    stw 6, 40(1)
+; LE-32BIT-NEXT:    stw 6, 36(1)
+; LE-32BIT-NEXT:    stw 6, 32(1)
+; LE-32BIT-NEXT:    stw 6, 28(1)
+; LE-32BIT-NEXT:    stw 6, 24(1)
+; LE-32BIT-NEXT:    stw 6, 20(1)
+; LE-32BIT-NEXT:    stw 6, 16(1)
+; LE-32BIT-NEXT:    stw 0, 72(1)
+; LE-32BIT-NEXT:    stw 12, 68(1)
+; LE-32BIT-NEXT:    stw 11, 64(1)
+; LE-32BIT-NEXT:    stw 10, 60(1)
+; LE-32BIT-NEXT:    stw 9, 56(1)
+; LE-32BIT-NEXT:    stw 8, 52(1)
+; LE-32BIT-NEXT:    stw 7, 48(1)
+; LE-32BIT-NEXT:    lwz 4, 4(3)
+; LE-32BIT-NEXT:    lwz 6, 0(3)
+; LE-32BIT-NEXT:    lwz 7, 12(3)
+; LE-32BIT-NEXT:    lwz 8, 8(3)
+; LE-32BIT-NEXT:    lwz 9, 20(3)
+; LE-32BIT-NEXT:    lwz 10, 16(3)
+; LE-32BIT-NEXT:    lwz 11, 24(3)
+; LE-32BIT-NEXT:    lwz 3, 28(3)
+; LE-32BIT-NEXT:    stw 11, 24(5)
+; LE-32BIT-NEXT:    stw 3, 28(5)
+; LE-32BIT-NEXT:    stw 10, 16(5)
+; LE-32BIT-NEXT:    stw 9, 20(5)
+; LE-32BIT-NEXT:    stw 8, 8(5)
+; LE-32BIT-NEXT:    stw 7, 12(5)
+; LE-32BIT-NEXT:    stw 6, 0(5)
+; LE-32BIT-NEXT:    stw 4, 4(5)
+; LE-32BIT-NEXT:    addi 1, 1, 80
+; LE-32BIT-NEXT:    blr
+  %src = load i256, ptr %src.ptr, align 1
+  %dwordOff = load i256, ptr %dwordOff.ptr, align 1
+  %bitOff = shl i256 %dwordOff, 6
+  %res = lshr i256 %src, %bitOff
+  store i256 %res, ptr %dst, align 1
+  ret void
+}
+
+define void @shl_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
+; LE-64BIT-LABEL: shl_32bytes:
+; LE-64BIT:       # %bb.0:
+; LE-64BIT-NEXT:    li 6, 16
+; LE-64BIT-NEXT:    lwz 4, 0(4)
+; LE-64BIT-NEXT:    xxlxor 2, 2, 2
+; LE-64BIT-NEXT:    addi 7, 1, -64
+; LE-64BIT-NEXT:    lxvd2x 1, 0, 3
+; LE-64BIT-NEXT:    addi 8, 1, -32
+; LE-64BIT-NEXT:    lxvd2x 0, 3, 6
+; LE-64BIT-NEXT:    stxvd2x 2, 7, 6
+; LE-64BIT-NEXT:    li 6, 48
+; LE-64BIT-NEXT:    rlwinm 3, 4, 0, 27, 28
+; LE-64BIT-NEXT:    rlwinm 4, 4, 3, 26, 28
+; LE-64BIT-NEXT:    neg 3, 3
+; LE-64BIT-NEXT:    stxvd2x 0, 7, 6
+; LE-64BIT-NEXT:    li 6, 32
+; LE-64BIT-NEXT:    extsw 3, 3
+; LE-64BIT-NEXT:    stxvd2x 1, 7, 6
+; LE-64BIT-NEXT:    stxvd2x 2, 0, 7
+; LE-64BIT-NEXT:    subfic 6, 4, 64
+; LE-64BIT-NEXT:    ldux 3, 8, 3
+; LE-64BIT-NEXT:    ld 7, 16(8)
+; LE-64BIT-NEXT:    ld 9, 24(8)
+; LE-64BIT-NEXT:    ld 8, 8(8)
+; LE-64BIT-NEXT:    srd 10, 7, 6
+; LE-64BIT-NEXT:    sld 9, 9, 4
+; LE-64BIT-NEXT:    sld 7, 7, 4
+; LE-64BIT-NEXT:    or 9, 9, 10
+; LE-64BIT-NEXT:    srd 10, 8, 6
+; LE-64BIT-NEXT:    srd 6, 3, 6
+; LE-64BIT-NEXT:    sld 8, 8, 4
+; LE-64BIT-NEXT:    sld 3, 3, 4
+; LE-64BIT-NEXT:    or 6, 8, 6
+; LE-64BIT-NEXT:    std 3, 0(5)
+; LE-64BIT-NEXT:    or 3, 7, 10
+; LE-64BIT-NEXT:    std 9, 24(5)
+; LE-64BIT-NEXT:    std 6, 8(5)
+; LE-64BIT-NEXT:    std 3, 16(5)
+; LE-64BIT-NEXT:    blr
+;
 ; BE-LABEL: shl_32bytes:
 ; BE:       # %bb.0:
 ; BE-NEXT:    ld 6, 0(3)
@@ -559,29 +1154,215 @@ define void @shl_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ; BE-NEXT:    ld 8, 16(3)
 ; BE-NEXT:    ld 3, 24(3)
 ; BE-NEXT:    lwz 4, 28(4)
-; BE-NEXT:    addi 9, 1, -64
-; BE-NEXT:    li 10, 0
-; BE-NEXT:    std 10, 56(9)
-; BE-NEXT:    std 10, 48(9)
-; BE-NEXT:    std 10, 40(9)
-; BE-NEXT:    std 10, 32(9)
-; BE-NEXT:    std 3, 24(9)
-; BE-NEXT:    std 8, 16(9)
-; BE-NEXT:    std 7, 8(9)
+; BE-NEXT:    li 9, 0
+; BE-NEXT:    addi 10, 1, -64
+; BE-NEXT:    std 9, -8(1)
+; BE-NEXT:    std 9, -16(1)
+; BE-NEXT:    std 9, -24(1)
+; BE-NEXT:    std 9, -32(1)
+; BE-NEXT:    std 3, -40(1)
+; BE-NEXT:    std 8, -48(1)
+; BE-NEXT:    std 7, -56(1)
 ; BE-NEXT:    std 6, -64(1)
-; BE-NEXT:    clrldi 3, 4, 59
-; BE-NEXT:    ldux 4, 3, 9
-; BE-NEXT:    ld 6, 8(3)
-; BE-NEXT:    ld 7, 24(3)
-; BE-NEXT:    ld 3, 16(3)
-; BE-NEXT:    std 4, 0(5)
-; BE-NEXT:    std 3, 16(5)
-; BE-NEXT:    std 7, 24(5)
-; BE-NEXT:    std 6, 8(5)
+; BE-NEXT:    rlwinm 3, 4, 0, 27, 28
+; BE-NEXT:    ldux 6, 3, 10
+; BE-NEXT:    rlwinm 4, 4, 3, 26, 28
+; BE-NEXT:    subfic 9, 4, 64
+; BE-NEXT:    ld 7, 16(3)
+; BE-NEXT:    ld 8, 8(3)
+; BE-NEXT:    ld 3, 24(3)
+; BE-NEXT:    sld 6, 6, 4
+; BE-NEXT:    srd 10, 7, 9
+; BE-NEXT:    sld 11, 8, 4
+; BE-NEXT:    srd 8, 8, 9
+; BE-NEXT:    srd 9, 3, 9
+; BE-NEXT:    sld 7, 7, 4
+; BE-NEXT:    sld 3, 3, 4
+; BE-NEXT:    or 10, 11, 10
+; BE-NEXT:    or 6, 6, 8
+; BE-NEXT:    or 7, 7, 9
+; BE-NEXT:    std 3, 24(5)
+; BE-NEXT:    std 7, 16(5)
+; BE-NEXT:    std 6, 0(5)
+; BE-NEXT:    std 10, 8(5)
 ; BE-NEXT:    blr
 ;
 ; LE-32BIT-LABEL: shl_32bytes:
 ; LE-32BIT:       # %bb.0:
+; LE-32BIT-NEXT:    stwu 1, -112(1)
+; LE-32BIT-NEXT:    lwz 7, 0(3)
+; LE-32BIT-NEXT:    li 6, 0
+; LE-32BIT-NEXT:    lwz 8, 4(3)
+; LE-32BIT-NEXT:    lwz 9, 8(3)
+; LE-32BIT-NEXT:    lwz 10, 12(3)
+; LE-32BIT-NEXT:    lwz 11, 16(3)
+; LE-32BIT-NEXT:    lwz 12, 20(3)
+; LE-32BIT-NEXT:    lwz 0, 24(3)
+; LE-32BIT-NEXT:    lwz 3, 28(3)
+; LE-32BIT-NEXT:    lwz 4, 28(4)
+; LE-32BIT-NEXT:    stw 25, 84(1) # 4-byte Folded Spill
+; LE-32BIT-NEXT:    stw 26, 88(1) # 4-byte Folded Spill
+; LE-32BIT-NEXT:    stw 27, 92(1) # 4-byte Folded Spill
+; LE-32BIT-NEXT:    stw 28, 96(1) # 4-byte Folded Spill
+; LE-32BIT-NEXT:    stw 29, 100(1) # 4-byte Folded Spill
+; LE-32BIT-NEXT:    stw 30, 104(1) # 4-byte Folded Spill
+; LE-32BIT-NEXT:    stw 6, 76(1)
+; LE-32BIT-NEXT:    stw 6, 72(1)
+; LE-32BIT-NEXT:    stw 6, 68(1)
+; LE-32BIT-NEXT:    stw 6, 64(1)
+; LE-32BIT-NEXT:    stw 6, 60(1)
+; LE-32BIT-NEXT:    stw 6, 56(1)
+; LE-32BIT-NEXT:    stw 6, 52(1)
+; LE-32BIT-NEXT:    stw 6, 48(1)
+; LE-32BIT-NEXT:    rlwinm 6, 4, 0, 27, 29
+; LE-32BIT-NEXT:    stw 3, 44(1)
+; LE-32BIT-NEXT:    addi 3, 1, 16
+; LE-32BIT-NEXT:    stw 0, 40(1)
+; LE-32BIT-NEXT:    rlwinm 4, 4, 3, 27, 28
+; LE-32BIT-NEXT:    stw 12, 36(1)
+; LE-32BIT-NEXT:    subfic 12, 4, 32
+; LE-32BIT-NEXT:    stw 11, 32(1)
+; LE-32BIT-NEXT:    stw 10, 28(1)
+; LE-32BIT-NEXT:    stw 9, 24(1)
+; LE-32BIT-NEXT:    stw 8, 20(1)
+; LE-32BIT-NEXT:    stw 7, 16(1)
+; LE-32BIT-NEXT:    lwzux 3, 6, 3
+; LE-32BIT-NEXT:    lwz 7, 8(6)
+; LE-32BIT-NEXT:    slw 3, 3, 4
+; LE-32BIT-NEXT:    lwz 8, 4(6)
+; LE-32BIT-NEXT:    lwz 9, 16(6)
+; LE-32BIT-NEXT:    srw 30, 7, 12
+; LE-32BIT-NEXT:    lwz 10, 12(6)
+; LE-32BIT-NEXT:    slw 29, 8, 4
+; LE-32BIT-NEXT:    lwz 11, 24(6)
+; LE-32BIT-NEXT:    srw 8, 8, 12
+; LE-32BIT-NEXT:    lwz 0, 20(6)
+; LE-32BIT-NEXT:    srw 28, 9, 12
+; LE-32BIT-NEXT:    lwz 6, 28(6)
+; LE-32BIT-NEXT:    slw 27, 10, 4
+; LE-32BIT-NEXT:    srw 10, 10, 12
+; LE-32BIT-NEXT:    slw 7, 7, 4
+; LE-32BIT-NEXT:    srw 26, 11, 12
+; LE-32BIT-NEXT:    slw 25, 0, 4
+; LE-32BIT-NEXT:    srw 0, 0, 12
+; LE-32BIT-NEXT:    slw 9, 9, 4
+; LE-32BIT-NEXT:    srw 12, 6, 12
+; LE-32BIT-NEXT:    slw 11, 11, 4
+; LE-32BIT-NEXT:    slw 4, 6, 4
+; LE-32BIT-NEXT:    stw 4, 28(5)
+; LE-32BIT-NEXT:    or 4, 11, 12
+; LE-32BIT-NEXT:    stw 4, 24(5)
+; LE-32BIT-NEXT:    or 4, 9, 0
+; LE-32BIT-NEXT:    stw 4, 16(5)
+; LE-32BIT-NEXT:    or 4, 25, 26
+; LE-32BIT-NEXT:    stw 4, 20(5)
+; LE-32BIT-NEXT:    or 4, 7, 10
+; LE-32BIT-NEXT:    or 3, 3, 8
+; LE-32BIT-NEXT:    stw 4, 8(5)
+; LE-32BIT-NEXT:    or 4, 27, 28
+; LE-32BIT-NEXT:    stw 3, 0(5)
+; LE-32BIT-NEXT:    or 3, 29, 30
+; LE-32BIT-NEXT:    stw 4, 12(5)
+; LE-32BIT-NEXT:    stw 3, 4(5)
+; LE-32BIT-NEXT:    lwz 30, 104(1) # 4-byte Folded Reload
+; LE-32BIT-NEXT:    lwz 29, 100(1) # 4-byte Folded Reload
+; LE-32BIT-NEXT:    lwz 28, 96(1) # 4-byte Folded Reload
+; LE-32BIT-NEXT:    lwz 27, 92(1) # 4-byte Folded Reload
+; LE-32BIT-NEXT:    lwz 26, 88(1) # 4-byte Folded Reload
+; LE-32BIT-NEXT:    lwz 25, 84(1) # 4-byte Folded Reload
+; LE-32BIT-NEXT:    addi 1, 1, 112
+; LE-32BIT-NEXT:    blr
+  %src = load i256, ptr %src.ptr, align 1
+  %byteOff = load i256, ptr %byteOff.ptr, align 1
+  %bitOff = shl i256 %byteOff, 3
+  %res = shl i256 %src, %bitOff
+  store i256 %res, ptr %dst, align 1
+  ret void
+}
+
+define void @shl_32bytes_wordOff(ptr %src.ptr, ptr %wordOff.ptr, ptr %dst) nounwind {
+; LE-64BIT-LABEL: shl_32bytes_wordOff:
+; LE-64BIT:       # %bb.0:
+; LE-64BIT-NEXT:    li 6, 16
+; LE-64BIT-NEXT:    lwz 4, 0(4)
+; LE-64BIT-NEXT:    xxlxor 2, 2, 2
+; LE-64BIT-NEXT:    addi 7, 1, -64
+; LE-64BIT-NEXT:    lxvd2x 1, 0, 3
+; LE-64BIT-NEXT:    addi 8, 1, -32
+; LE-64BIT-NEXT:    lxvd2x 0, 3, 6
+; LE-64BIT-NEXT:    stxvd2x 2, 7, 6
+; LE-64BIT-NEXT:    li 6, 48
+; LE-64BIT-NEXT:    rlwinm 3, 4, 2, 27, 28
+; LE-64BIT-NEXT:    rlwinm 4, 4, 5, 26, 26
+; LE-64BIT-NEXT:    neg 3, 3
+; LE-64BIT-NEXT:    stxvd2x 0, 7, 6
+; LE-64BIT-NEXT:    li 6, 32
+; LE-64BIT-NEXT:    extsw 3, 3
+; LE-64BIT-NEXT:    stxvd2x 1, 7, 6
+; LE-64BIT-NEXT:    stxvd2x 2, 0, 7
+; LE-64BIT-NEXT:    subfic 6, 4, 64
+; LE-64BIT-NEXT:    ldux 3, 8, 3
+; LE-64BIT-NEXT:    ld 7, 16(8)
+; LE-64BIT-NEXT:    ld 9, 24(8)
+; LE-64BIT-NEXT:    ld 8, 8(8)
+; LE-64BIT-NEXT:    srd 10, 7, 6
+; LE-64BIT-NEXT:    sld 9, 9, 4
+; LE-64BIT-NEXT:    sld 7, 7, 4
+; LE-64BIT-NEXT:    or 9, 9, 10
+; LE-64BIT-NEXT:    srd 10, 8, 6
+; LE-64BIT-NEXT:    srd 6, 3, 6
+; LE-64BIT-NEXT:    sld 8, 8, 4
+; LE-64BIT-NEXT:    sld 3, 3, 4
+; LE-64BIT-NEXT:    or 6, 8, 6
+; LE-64BIT-NEXT:    std 3, 0(5)
+; LE-64BIT-NEXT:    or 3, 7, 10
+; LE-64BIT-NEXT:    std 9, 24(5)
+; LE-64BIT-NEXT:    std 6, 8(5)
+; LE-64BIT-NEXT:    std 3, 16(5)
+; LE-64BIT-NEXT:    blr
+;
+; BE-LABEL: shl_32bytes_wordOff:
+; BE:       # %bb.0:
+; BE-NEXT:    ld 6, 0(3)
+; BE-NEXT:    ld 7, 8(3)
+; BE-NEXT:    ld 8, 16(3)
+; BE-NEXT:    ld 3, 24(3)
+; BE-NEXT:    lwz 4, 28(4)
+; BE-NEXT:    li 9, 0
+; BE-NEXT:    addi 10, 1, -64
+; BE-NEXT:    std 9, -8(1)
+; BE-NEXT:    std 9, -16(1)
+; BE-NEXT:    std 9, -24(1)
+; BE-NEXT:    std 9, -32(1)
+; BE-NEXT:    std 3, -40(1)
+; BE-NEXT:    std 8, -48(1)
+; BE-NEXT:    std 7, -56(1)
+; BE-NEXT:    std 6, -64(1)
+; BE-NEXT:    rlwinm 3, 4, 2, 27, 28
+; BE-NEXT:    ldux 6, 3, 10
+; BE-NEXT:    rlwinm 4, 4, 5, 26, 26
+; BE-NEXT:    subfic 9, 4, 64
+; BE-NEXT:    ld 7, 16(3)
+; BE-NEXT:    ld 8, 8(3)
+; BE-NEXT:    ld 3, 24(3)
+; BE-NEXT:    sld 6, 6, 4
+; BE-NEXT:    srd 10, 7, 9
+; BE-NEXT:    sld 11, 8, 4
+; BE-NEXT:    srd 8, 8, 9
+; BE-NEXT:    srd 9, 3, 9
+; BE-NEXT:    sld 7, 7, 4
+; BE-NEXT:    sld 3, 3, 4
+; BE-NEXT:    or 10, 11, 10
+; BE-NEXT:    or 6, 6, 8
+; BE-NEXT:    or 7, 7, 9
+; BE-NEXT:    std 3, 24(5)
+; BE-NEXT:    std 7, 16(5)
+; BE-NEXT:    std 6, 0(5)
+; BE-NEXT:    std 10, 8(5)
+; BE-NEXT:    blr
+;
+; LE-32BIT-LABEL: shl_32bytes_wordOff:
+; LE-32BIT:       # %bb.0:
 ; LE-32BIT-NEXT:    stwu 1, -80(1)
 ; LE-32BIT-NEXT:    lwz 7, 0(3)
 ; LE-32BIT-NEXT:    li 6, 0
@@ -595,7 +1376,7 @@ define void @shl_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ; LE-32BIT-NEXT:    lwz 4, 28(4)
 ; LE-32BIT-NEXT:    stw 6, 76(1)
 ; LE-32BIT-NEXT:    stw 6, 72(1)
-; LE-32BIT-NEXT:    clrlwi 4, 4, 27
+; LE-32BIT-NEXT:    rlwinm 4, 4, 2, 27, 29
 ; LE-32BIT-NEXT:    stw 6, 68(1)
 ; LE-32BIT-NEXT:    stw 6, 64(1)
 ; LE-32BIT-NEXT:    stw 6, 60(1)
@@ -630,69 +1411,496 @@ define void @shl_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ; LE-32BIT-NEXT:    addi 1, 1, 80
 ; LE-32BIT-NEXT:    blr
   %src = load i256, ptr %src.ptr, align 1
-  %byteOff = load i256, ptr %byteOff.ptr, align 1
-  %bitOff = shl i256 %byteOff, 3
+  %wordOff = load i256, ptr %wordOff.ptr, align 1
+  %bitOff = shl i256 %wordOff, 5
+  %res = shl i256 %src, %bitOff
+  store i256 %res, ptr %dst, align 1
+  ret void
+}
+
+define void @shl_32bytes_dwordOff(ptr %src.ptr, ptr %dwordOff.ptr, ptr %dst) nounwind {
+; LE-64BIT-LABEL: shl_32bytes_dwordOff:
+; LE-64BIT:       # %bb.0:
+; LE-64BIT-NEXT:    li 6, 16
+; LE-64BIT-NEXT:    lxvd2x 1, 0, 3
+; LE-64BIT-NEXT:    xxlxor 2, 2, 2
+; LE-64BIT-NEXT:    li 7, 48
+; LE-64BIT-NEXT:    lxvd2x 0, 3, 6
+; LE-64BIT-NEXT:    lwz 3, 0(4)
+; LE-64BIT-NEXT:    addi 4, 1, -64
+; LE-64BIT-NEXT:    stxvd2x 2, 4, 6
+; LE-64BIT-NEXT:    rlwinm 3, 3, 3, 27, 28
+; LE-64BIT-NEXT:    stxvd2x 0, 4, 7
+; LE-64BIT-NEXT:    li 7, 32
+; LE-64BIT-NEXT:    neg 3, 3
+; LE-64BIT-NEXT:    stxvd2x 1, 4, 7
+; LE-64BIT-NEXT:    stxvd2x 2, 0, 4
+; LE-64BIT-NEXT:    extsw 3, 3
+; LE-64BIT-NEXT:    addi 4, 1, -32
+; LE-64BIT-NEXT:    lxvd2x 0, 4, 3
+; LE-64BIT-NEXT:    add 3, 4, 3
+; LE-64BIT-NEXT:    lxvd2x 1, 3, 6
+; LE-64BIT-NEXT:    stxvd2x 1, 5, 6
+; LE-64BIT-NEXT:    stxvd2x 0, 0, 5
+; LE-64BIT-NEXT:    blr
+;
+; BE-LABEL: shl_32bytes_dwordOff:
+; BE:       # %bb.0:
+; BE-NEXT:    ld 7, 0(3)
+; BE-NEXT:    ld 8, 8(3)
+; BE-NEXT:    ld 9, 16(3)
+; BE-NEXT:    ld 3, 24(3)
+; BE-NEXT:    lwz 4, 28(4)
+; BE-NEXT:    li 6, 0
+; BE-NEXT:    std 6, -8(1)
+; BE-NEXT:    std 6, -16(1)
+; BE-NEXT:    std 6, -24(1)
+; BE-NEXT:    std 6, -32(1)
+; BE-NEXT:    std 3, -40(1)
+; BE-NEXT:    std 9, -48(1)
+; BE-NEXT:    std 8, -56(1)
+; BE-NEXT:    std 7, -64(1)
+; BE-NEXT:    rlwinm 3, 4, 3, 27, 28
+; BE-NEXT:    addi 4, 1, -64
+; BE-NEXT:    ldux 4, 3, 4
+; BE-NEXT:    ld 6, 8(3)
+; BE-NEXT:    ld 7, 24(3)
+; BE-NEXT:    ld 3, 16(3)
+; BE-NEXT:    std 4, 0(5)
+; BE-NEXT:    std 3, 16(5)
+; BE-NEXT:    std 7, 24(5)
+; BE-NEXT:    std 6, 8(5)
+; BE-NEXT:    blr
+;
+; LE-32BIT-LABEL: shl_32bytes_dwordOff:
+; LE-32BIT:       # %bb.0:
+; LE-32BIT-NEXT:    stwu 1, -80(1)
+; LE-32BIT-NEXT:    lwz 7, 0(3)
+; LE-32BIT-NEXT:    li 6, 0
+; LE-32BIT-NEXT:    lwz 8, 4(3)
+; LE-32BIT-NEXT:    lwz 9, 8(3)
+; LE-32BIT-NEXT:    lwz 10, 12(3)
+; LE-32BIT-NEXT:    lwz 11, 16(3)
+; LE-32BIT-NEXT:    lwz 12, 20(3)
+; LE-32BIT-NEXT:    lwz 0, 24(3)
+; LE-32BIT-NEXT:    lwz 3, 28(3)
+; LE-32BIT-NEXT:    lwz 4, 28(4)
+; LE-32BIT-NEXT:    stw 6, 76(1)
+; LE-32BIT-NEXT:    stw 6, 72(1)
+; LE-32BIT-NEXT:    rlwinm 4, 4, 3, 27, 28
+; LE-32BIT-NEXT:    stw 6, 68(1)
+; LE-32BIT-NEXT:    stw 6, 64(1)
+; LE-32BIT-NEXT:    stw 6, 60(1)
+; LE-32BIT-NEXT:    stw 6, 56(1)
+; LE-32BIT-NEXT:    stw 6, 52(1)
+; LE-32BIT-NEXT:    stw 6, 48(1)
+; LE-32BIT-NEXT:    stw 3, 44(1)
+; LE-32BIT-NEXT:    addi 3, 1, 16
+; LE-32BIT-NEXT:    stw 0, 40(1)
+; LE-32BIT-NEXT:    stw 12, 36(1)
+; LE-32BIT-NEXT:    stw 11, 32(1)
+; LE-32BIT-NEXT:    stw 10, 28(1)
+; LE-32BIT-NEXT:    stw 9, 24(1)
+; LE-32BIT-NEXT:    stw 8, 20(1)
+; LE-32BIT-NEXT:    stw 7, 16(1)
+; LE-32BIT-NEXT:    lwzux 3, 4, 3
+; LE-32BIT-NEXT:    lwz 6, 12(4)
+; LE-32BIT-NEXT:    lwz 7, 8(4)
+; LE-32BIT-NEXT:    lwz 8, 20(4)
+; LE-32BIT-NEXT:    lwz 9, 16(4)
+; LE-32BIT-NEXT:    lwz 10, 28(4)
+; LE-32BIT-NEXT:    lwz 11, 24(4)
+; LE-32BIT-NEXT:    ori 4, 4, 4
+; LE-32BIT-NEXT:    lwz 4, 0(4)
+; LE-32BIT-NEXT:    stw 3, 0(5)
+; LE-32BIT-NEXT:    stw 11, 24(5)
+; LE-32BIT-NEXT:    stw 10, 28(5)
+; LE-32BIT-NEXT:    stw 9, 16(5)
+; LE-32BIT-NEXT:    stw 8, 20(5)
+; LE-32BIT-NEXT:    stw 7, 8(5)
+; LE-32BIT-NEXT:    stw 6, 12(5)
+; LE-32BIT-NEXT:    stw 4, 4(5)
+; LE-32BIT-NEXT:    addi 1, 1, 80
+; LE-32BIT-NEXT:    blr
+  %src = load i256, ptr %src.ptr, align 1
+  %dwordOff = load i256, ptr %dwordOff.ptr, align 1
+  %bitOff = shl i256 %dwordOff, 6
   %res = shl i256 %src, %bitOff
   store i256 %res, ptr %dst, align 1
   ret void
 }
+
+
 define void @ashr_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ; LE-64BIT-LABEL: ashr_32bytes:
 ; LE-64BIT:       # %bb.0:
+; LE-64BIT-NEXT:    ld 6, 24(3)
 ; LE-64BIT-NEXT:    lxvd2x 0, 0, 3
-; LE-64BIT-NEXT:    ld 6, 16(3)
-; LE-64BIT-NEXT:    ld 3, 24(3)
+; LE-64BIT-NEXT:    lwz 4, 0(4)
 ; LE-64BIT-NEXT:    addi 7, 1, -64
+; LE-64BIT-NEXT:    ld 3, 16(3)
+; LE-64BIT-NEXT:    sradi 8, 6, 63
+; LE-64BIT-NEXT:    rlwinm 9, 4, 0, 27, 28
+; LE-64BIT-NEXT:    stxvd2x 0, 0, 7
+; LE-64BIT-NEXT:    std 6, -40(1)
+; LE-64BIT-NEXT:    std 3, -48(1)
+; LE-64BIT-NEXT:    std 8, -8(1)
+; LE-64BIT-NEXT:    std 8, -16(1)
+; LE-64BIT-NEXT:    std 8, -24(1)
+; LE-64BIT-NEXT:    std 8, -32(1)
+; LE-64BIT-NEXT:    rlwinm 3, 4, 3, 26, 28
+; LE-64BIT-NEXT:    ldux 4, 9, 7
+; LE-64BIT-NEXT:    ld 7, 8(9)
+; LE-64BIT-NEXT:    subfic 6, 3, 64
+; LE-64BIT-NEXT:    ld 8, 16(9)
+; LE-64BIT-NEXT:    ld 9, 24(9)
+; LE-64BIT-NEXT:    srd 4, 4, 3
+; LE-64BIT-NEXT:    sld 10, 7, 6
+; LE-64BIT-NEXT:    sld 11, 9, 6
+; LE-64BIT-NEXT:    srd 7, 7, 3
+; LE-64BIT-NEXT:    sld 6, 8, 6
+; LE-64BIT-NEXT:    or 4, 10, 4
+; LE-64BIT-NEXT:    srd 10, 8, 3
+; LE-64BIT-NEXT:    srad 3, 9, 3
+; LE-64BIT-NEXT:    or 6, 6, 7
+; LE-64BIT-NEXT:    std 3, 24(5)
+; LE-64BIT-NEXT:    or 3, 11, 10
+; LE-64BIT-NEXT:    std 6, 8(5)
+; LE-64BIT-NEXT:    std 4, 0(5)
+; LE-64BIT-NEXT:    std 3, 16(5)
+; LE-64BIT-NEXT:    blr
+;
+; BE-LABEL: ashr_32bytes:
+; BE:       # %bb.0:
+; BE-NEXT:    ld 7, 0(3)
+; BE-NEXT:    ld 8, 8(3)
+; BE-NEXT:    ld 9, 16(3)
+; BE-NEXT:    ld 3, 24(3)
+; BE-NEXT:    lwz 4, 28(4)
+; BE-NEXT:    addi 6, 1, -32
+; BE-NEXT:    std 3, -8(1)
+; BE-NEXT:    std 7, -32(1)
+; BE-NEXT:    sradi 3, 7, 63
+; BE-NEXT:    rlwinm 7, 4, 0, 27, 28
+; BE-NEXT:    std 3, -40(1)
+; BE-NEXT:    std 3, -48(1)
+; BE-NEXT:    std 3, -56(1)
+; BE-NEXT:    std 3, -64(1)
+; BE-NEXT:    neg 3, 7
+; BE-NEXT:    std 9, -16(1)
+; BE-NEXT:    std 8, -24(1)
+; BE-NEXT:    extsw 3, 3
+; BE-NEXT:    ldux 3, 6, 3
+; BE-NEXT:    rlwinm 4, 4, 3, 26, 28
+; BE-NEXT:    subfic 9, 4, 64
+; BE-NEXT:    ld 7, 8(6)
+; BE-NEXT:    ld 8, 24(6)
+; BE-NEXT:    ld 6, 16(6)
+; BE-NEXT:    sld 10, 3, 9
+; BE-NEXT:    srad 3, 3, 4
+; BE-NEXT:    std 3, 0(5)
+; BE-NEXT:    srd 11, 7, 4
+; BE-NEXT:    srd 8, 8, 4
+; BE-NEXT:    sld 7, 7, 9
+; BE-NEXT:    sld 9, 6, 9
+; BE-NEXT:    srd 6, 6, 4
+; BE-NEXT:    or 10, 10, 11
+; BE-NEXT:    or 8, 9, 8
+; BE-NEXT:    or 6, 7, 6
+; BE-NEXT:    std 6, 16(5)
+; BE-NEXT:    std 8, 24(5)
+; BE-NEXT:    std 10, 8(5)
+; BE-NEXT:    blr
+;
+; LE-32BIT-LABEL: ashr_32bytes:
+; LE-32BIT:       # %bb.0:
+; LE-32BIT-NEXT:    stwu 1, -112(1)
+; LE-32BIT-NEXT:    lwz 7, 0(3)
+; LE-32BIT-NEXT:    addi 6, 1, 48
+; LE-32BIT-NEXT:    lwz 8, 4(3)
+; LE-32BIT-NEXT:    lwz 9, 8(3)
+; LE-32BIT-NEXT:    lwz 10, 12(3)
+; LE-32BIT-NEXT:    lwz 11, 16(3)
+; LE-32BIT-NEXT:    lwz 12, 20(3)
+; LE-32BIT-NEXT:    lwz 0, 24(3)
+; LE-32BIT-NEXT:    lwz 3, 28(3)
+; LE-32BIT-NEXT:    lwz 4, 28(4)
+; LE-32BIT-NEXT:    stw 3, 76(1)
+; LE-32BIT-NEXT:    srawi 3, 7, 31
+; LE-32BIT-NEXT:    stw 7, 48(1)
+; LE-32BIT-NEXT:    rlwinm 7, 4, 0, 27, 29
+; LE-32BIT-NEXT:    stw 25, 84(1) # 4-byte Folded Spill
+; LE-32BIT-NEXT:    rlwinm 4, 4, 3, 27, 28
+; LE-32BIT-NEXT:    stw 26, 88(1) # 4-byte Folded Spill
+; LE-32BIT-NEXT:    stw 27, 92(1) # 4-byte Folded Spill
+; LE-32BIT-NEXT:    stw 28, 96(1) # 4-byte Folded Spill
+; LE-32BIT-NEXT:    stw 29, 100(1) # 4-byte Folded Spill
+; LE-32BIT-NEXT:    stw 30, 104(1) # 4-byte Folded Spill
+; LE-32BIT-NEXT:    stw 0, 72(1)
+; LE-32BIT-NEXT:    subfic 0, 4, 32
+; LE-32BIT-NEXT:    stw 12, 68(1)
+; LE-32BIT-NEXT:    stw 11, 64(1)
+; LE-32BIT-NEXT:    stw 10, 60(1)
+; LE-32BIT-NEXT:    stw 9, 56(1)
+; LE-32BIT-NEXT:    stw 8, 52(1)
+; LE-32BIT-NEXT:    stw 3, 44(1)
+; LE-32BIT-NEXT:    stw 3, 40(1)
+; LE-32BIT-NEXT:    stw 3, 36(1)
+; LE-32BIT-NEXT:    stw 3, 32(1)
+; LE-32BIT-NEXT:    stw 3, 28(1)
+; LE-32BIT-NEXT:    stw 3, 24(1)
+; LE-32BIT-NEXT:    stw 3, 20(1)
+; LE-32BIT-NEXT:    stw 3, 16(1)
+; LE-32BIT-NEXT:    sub 3, 6, 7
+; LE-32BIT-NEXT:    lwz 6, 4(3)
+; LE-32BIT-NEXT:    lwz 7, 0(3)
+; LE-32BIT-NEXT:    lwz 8, 12(3)
+; LE-32BIT-NEXT:    srw 30, 6, 4
+; LE-32BIT-NEXT:    lwz 9, 8(3)
+; LE-32BIT-NEXT:    slw 29, 7, 0
+; LE-32BIT-NEXT:    lwz 10, 20(3)
+; LE-32BIT-NEXT:    srw 28, 8, 4
+; LE-32BIT-NEXT:    lwz 11, 16(3)
+; LE-32BIT-NEXT:    slw 27, 9, 0
+; LE-32BIT-NEXT:    lwz 12, 28(3)
+; LE-32BIT-NEXT:    slw 6, 6, 0
+; LE-32BIT-NEXT:    lwz 3, 24(3)
+; LE-32BIT-NEXT:    srw 26, 10, 4
+; LE-32BIT-NEXT:    slw 25, 11, 0
+; LE-32BIT-NEXT:    slw 8, 8, 0
+; LE-32BIT-NEXT:    slw 10, 10, 0
+; LE-32BIT-NEXT:    slw 0, 3, 0
+; LE-32BIT-NEXT:    srw 3, 3, 4
+; LE-32BIT-NEXT:    srw 12, 12, 4
+; LE-32BIT-NEXT:    or 3, 10, 3
+; LE-32BIT-NEXT:    srw 11, 11, 4
+; LE-32BIT-NEXT:    stw 3, 24(5)
+; LE-32BIT-NEXT:    or 3, 0, 12
+; LE-32BIT-NEXT:    stw 3, 28(5)
+; LE-32BIT-NEXT:    or 3, 8, 11
+; LE-32BIT-NEXT:    srw 9, 9, 4
+; LE-32BIT-NEXT:    stw 3, 16(5)
+; LE-32BIT-NEXT:    or 3, 25, 26
+; LE-32BIT-NEXT:    stw 3, 20(5)
+; LE-32BIT-NEXT:    or 3, 6, 9
+; LE-32BIT-NEXT:    stw 3, 8(5)
+; LE-32BIT-NEXT:    or 3, 27, 28
+; LE-32BIT-NEXT:    sraw 4, 7, 4
+; LE-32BIT-NEXT:    stw 3, 12(5)
+; LE-32BIT-NEXT:    or 3, 29, 30
+; LE-32BIT-NEXT:    stw 4, 0(5)
+; LE-32BIT-NEXT:    stw 3, 4(5)
+; LE-32BIT-NEXT:    lwz 30, 104(1) # 4-byte Folded Reload
+; LE-32BIT-NEXT:    lwz 29, 100(1) # 4-byte Folded Reload
+; LE-32BIT-NEXT:    lwz 28, 96(1) # 4-byte Folded Reload
+; LE-32BIT-NEXT:    lwz 27, 92(1) # 4-byte Folded Reload
+; LE-32BIT-NEXT:    lwz 26, 88(1) # 4-byte Folded Reload
+; LE-32BIT-NEXT:    lwz 25, 84(1) # 4-byte Folded Reload
+; LE-32BIT-NEXT:    addi 1, 1, 112
+; LE-32BIT-NEXT:    blr
+  %src = load i256, ptr %src.ptr, align 1
+  %byteOff = load i256, ptr %byteOff.ptr, align 1
+  %bitOff = shl i256 %byteOff, 3
+  %res = ashr i256 %src, %bitOff
+  store i256 %res, ptr %dst, align 1
+  ret void
+}
+
+define void @ashr_32bytes_wordOff(ptr %src.ptr, ptr %wordOff.ptr, ptr %dst) nounwind {
+; LE-64BIT-LABEL: ashr_32bytes_wordOff:
+; LE-64BIT:       # %bb.0:
+; LE-64BIT-NEXT:    ld 6, 24(3)
+; LE-64BIT-NEXT:    lxvd2x 0, 0, 3
 ; LE-64BIT-NEXT:    lwz 4, 0(4)
-; LE-64BIT-NEXT:    li 8, 16
-; LE-64BIT-NEXT:    std 3, 24(7)
-; LE-64BIT-NEXT:    sradi 3, 3, 63
-; LE-64BIT-NEXT:    std 6, 16(7)
-; LE-64BIT-NEXT:    std 3, 56(7)
-; LE-64BIT-NEXT:    std 3, 48(7)
-; LE-64BIT-NEXT:    std 3, 40(7)
-; LE-64BIT-NEXT:    std 3, 32(7)
-; LE-64BIT-NEXT:    clrldi 3, 4, 59
+; LE-64BIT-NEXT:    addi 7, 1, -64
+; LE-64BIT-NEXT:    ld 3, 16(3)
+; LE-64BIT-NEXT:    sradi 8, 6, 63
+; LE-64BIT-NEXT:    rlwinm 9, 4, 2, 27, 28
 ; LE-64BIT-NEXT:    stxvd2x 0, 0, 7
-; LE-64BIT-NEXT:    lxvd2x 0, 7, 3
-; LE-64BIT-NEXT:    add 3, 7, 3
-; LE-64BIT-NEXT:    lxvd2x 1, 3, 8
-; LE-64BIT-NEXT:    stxvd2x 1, 5, 8
+; LE-64BIT-NEXT:    std 6, -40(1)
+; LE-64BIT-NEXT:    std 3, -48(1)
+; LE-64BIT-NEXT:    std 8, -8(1)
+; LE-64BIT-NEXT:    std 8, -16(1)
+; LE-64BIT-NEXT:    std 8, -24(1)
+; LE-64BIT-NEXT:    std 8, -32(1)
+; LE-64BIT-NEXT:    rlwinm 3, 4, 5, 26, 26
+; LE-64BIT-NEXT:    ldux 4, 9, 7
+; LE-64BIT-NEXT:    ld 7, 8(9)
+; LE-64BIT-NEXT:    subfic 6, 3, 64
+; LE-64BIT-NEXT:    ld 8, 16(9)
+; LE-64BIT-NEXT:    ld 9, 24(9)
+; LE-64BIT-NEXT:    srd 4, 4, 3
+; LE-64BIT-NEXT:    sld 10, 7, 6
+; LE-64BIT-NEXT:    sld 11, 9, 6
+; LE-64BIT-NEXT:    srd 7, 7, 3
+; LE-64BIT-NEXT:    sld 6, 8, 6
+; LE-64BIT-NEXT:    or 4, 10, 4
+; LE-64BIT-NEXT:    srd 10, 8, 3
+; LE-64BIT-NEXT:    srad 3, 9, 3
+; LE-64BIT-NEXT:    or 6, 6, 7
+; LE-64BIT-NEXT:    std 3, 24(5)
+; LE-64BIT-NEXT:    or 3, 11, 10
+; LE-64BIT-NEXT:    std 6, 8(5)
+; LE-64BIT-NEXT:    std 4, 0(5)
+; LE-64BIT-NEXT:    std 3, 16(5)
+; LE-64BIT-NEXT:    blr
+;
+; BE-LABEL: ashr_32bytes_wordOff:
+; BE:       # %bb.0:
+; BE-NEXT:    ld 7, 0(3)
+; BE-NEXT:    ld 8, 8(3)
+; BE-NEXT:    ld 9, 16(3)
+; BE-NEXT:    ld 3, 24(3)
+; BE-NEXT:    lwz 4, 28(4)
+; BE-NEXT:    addi 6, 1, -32
+; BE-NEXT:    std 3, -8(1)
+; BE-NEXT:    std 7, -32(1)
+; BE-NEXT:    sradi 3, 7, 63
+; BE-NEXT:    rlwinm 7, 4, 2, 27, 28
+; BE-NEXT:    std 3, -40(1)
+; BE-NEXT:    std 3, -48(1)
+; BE-NEXT:    std 3, -56(1)
+; BE-NEXT:    std 3, -64(1)
+; BE-NEXT:    neg 3, 7
+; BE-NEXT:    std 9, -16(1)
+; BE-NEXT:    std 8, -24(1)
+; BE-NEXT:    extsw 3, 3
+; BE-NEXT:    ldux 3, 6, 3
+; BE-NEXT:    rlwinm 4, 4, 5, 26, 26
+; BE-NEXT:    subfic 9, 4, 64
+; BE-NEXT:    ld 7, 8(6)
+; BE-NEXT:    ld 8, 24(6)
+; BE-NEXT:    ld 6, 16(6)
+; BE-NEXT:    sld 10, 3, 9
+; BE-NEXT:    srad 3, 3, 4
+; BE-NEXT:    std 3, 0(5)
+; BE-NEXT:    srd 11, 7, 4
+; BE-NEXT:    srd 8, 8, 4
+; BE-NEXT:    sld 7, 7, 9
+; BE-NEXT:    sld 9, 6, 9
+; BE-NEXT:    srd 6, 6, 4
+; BE-NEXT:    or 10, 10, 11
+; BE-NEXT:    or 8, 9, 8
+; BE-NEXT:    or 6, 7, 6
+; BE-NEXT:    std 6, 16(5)
+; BE-NEXT:    std 8, 24(5)
+; BE-NEXT:    std 10, 8(5)
+; BE-NEXT:    blr
+;
+; LE-32BIT-LABEL: ashr_32bytes_wordOff:
+; LE-32BIT:       # %bb.0:
+; LE-32BIT-NEXT:    stwu 1, -80(1)
+; LE-32BIT-NEXT:    lwz 7, 0(3)
+; LE-32BIT-NEXT:    addi 6, 1, 48
+; LE-32BIT-NEXT:    lwz 8, 4(3)
+; LE-32BIT-NEXT:    lwz 9, 8(3)
+; LE-32BIT-NEXT:    lwz 10, 12(3)
+; LE-32BIT-NEXT:    lwz 11, 16(3)
+; LE-32BIT-NEXT:    lwz 12, 20(3)
+; LE-32BIT-NEXT:    lwz 0, 24(3)
+; LE-32BIT-NEXT:    lwz 3, 28(3)
+; LE-32BIT-NEXT:    lwz 4, 28(4)
+; LE-32BIT-NEXT:    stw 3, 76(1)
+; LE-32BIT-NEXT:    srawi 3, 7, 31
+; LE-32BIT-NEXT:    rlwinm 4, 4, 2, 27, 29
+; LE-32BIT-NEXT:    stw 0, 72(1)
+; LE-32BIT-NEXT:    stw 12, 68(1)
+; LE-32BIT-NEXT:    stw 11, 64(1)
+; LE-32BIT-NEXT:    stw 10, 60(1)
+; LE-32BIT-NEXT:    stw 9, 56(1)
+; LE-32BIT-NEXT:    stw 8, 52(1)
+; LE-32BIT-NEXT:    stw 7, 48(1)
+; LE-32BIT-NEXT:    stw 3, 44(1)
+; LE-32BIT-NEXT:    stw 3, 40(1)
+; LE-32BIT-NEXT:    stw 3, 36(1)
+; LE-32BIT-NEXT:    stw 3, 32(1)
+; LE-32BIT-NEXT:    stw 3, 28(1)
+; LE-32BIT-NEXT:    stw 3, 24(1)
+; LE-32BIT-NEXT:    stw 3, 20(1)
+; LE-32BIT-NEXT:    stw 3, 16(1)
+; LE-32BIT-NEXT:    sub 3, 6, 4
+; LE-32BIT-NEXT:    lwz 4, 4(3)
+; LE-32BIT-NEXT:    lwz 6, 0(3)
+; LE-32BIT-NEXT:    lwz 7, 12(3)
+; LE-32BIT-NEXT:    lwz 8, 8(3)
+; LE-32BIT-NEXT:    lwz 9, 20(3)
+; LE-32BIT-NEXT:    lwz 10, 16(3)
+; LE-32BIT-NEXT:    lwz 11, 24(3)
+; LE-32BIT-NEXT:    lwz 3, 28(3)
+; LE-32BIT-NEXT:    stw 11, 24(5)
+; LE-32BIT-NEXT:    stw 3, 28(5)
+; LE-32BIT-NEXT:    stw 10, 16(5)
+; LE-32BIT-NEXT:    stw 9, 20(5)
+; LE-32BIT-NEXT:    stw 8, 8(5)
+; LE-32BIT-NEXT:    stw 7, 12(5)
+; LE-32BIT-NEXT:    stw 6, 0(5)
+; LE-32BIT-NEXT:    stw 4, 4(5)
+; LE-32BIT-NEXT:    addi 1, 1, 80
+; LE-32BIT-NEXT:    blr
+  %src = load i256, ptr %src.ptr, align 1
+  %wordOff = load i256, ptr %wordOff.ptr, align 1
+  %bitOff = shl i256 %wordOff, 5
+  %res = ashr i256 %src, %bitOff
+  store i256 %res, ptr %dst, align 1
+  ret void
+}
+
+define void @ashr_32bytes_dwordOff(ptr %src.ptr, ptr %dwordOff.ptr, ptr %dst) nounwind {
+; LE-64BIT-LABEL: ashr_32bytes_dwordOff:
+; LE-64BIT:       # %bb.0:
+; LE-64BIT-NEXT:    lxvd2x 0, 0, 3
+; LE-64BIT-NEXT:    ld 6, 16(3)
+; LE-64BIT-NEXT:    ld 7, 24(3)
+; LE-64BIT-NEXT:    lwz 3, 0(4)
+; LE-64BIT-NEXT:    addi 4, 1, -64
+; LE-64BIT-NEXT:    rlwinm 3, 3, 3, 27, 28
+; LE-64BIT-NEXT:    stxvd2x 0, 0, 4
+; LE-64BIT-NEXT:    std 6, -48(1)
+; LE-64BIT-NEXT:    sradi 6, 7, 63
+; LE-64BIT-NEXT:    std 7, -40(1)
+; LE-64BIT-NEXT:    std 6, -8(1)
+; LE-64BIT-NEXT:    std 6, -16(1)
+; LE-64BIT-NEXT:    std 6, -24(1)
+; LE-64BIT-NEXT:    std 6, -32(1)
+; LE-64BIT-NEXT:    lxvd2x 0, 4, 3
+; LE-64BIT-NEXT:    add 3, 4, 3
+; LE-64BIT-NEXT:    li 4, 16
+; LE-64BIT-NEXT:    lxvd2x 1, 3, 4
+; LE-64BIT-NEXT:    stxvd2x 1, 5, 4
 ; LE-64BIT-NEXT:    stxvd2x 0, 0, 5
 ; LE-64BIT-NEXT:    blr
 ;
-; BE-LABEL: ashr_32bytes:
+; BE-LABEL: ashr_32bytes_dwordOff:
 ; BE:       # %bb.0:
 ; BE-NEXT:    ld 7, 0(3)
 ; BE-NEXT:    ld 8, 8(3)
 ; BE-NEXT:    ld 9, 16(3)
 ; BE-NEXT:    ld 3, 24(3)
 ; BE-NEXT:    lwz 4, 28(4)
-; BE-NEXT:    addi 6, 1, -64
-; BE-NEXT:    std 3, 56(6)
+; BE-NEXT:    addi 6, 1, -32
+; BE-NEXT:    std 3, -8(1)
 ; BE-NEXT:    sradi 3, 7, 63
-; BE-NEXT:    clrlwi 4, 4, 27
-; BE-NEXT:    std 3, 24(6)
-; BE-NEXT:    std 3, 16(6)
-; BE-NEXT:    std 3, 8(6)
+; BE-NEXT:    rlwinm 4, 4, 3, 27, 28
+; BE-NEXT:    std 3, -40(1)
+; BE-NEXT:    std 3, -48(1)
+; BE-NEXT:    std 3, -56(1)
 ; BE-NEXT:    std 3, -64(1)
 ; BE-NEXT:    neg 3, 4
-; BE-NEXT:    std 9, 48(6)
-; BE-NEXT:    std 8, 40(6)
-; BE-NEXT:    std 7, 32(6)
+; BE-NEXT:    std 9, -16(1)
+; BE-NEXT:    std 8, -24(1)
+; BE-NEXT:    std 7, -32(1)
 ; BE-NEXT:    extsw 3, 3
-; BE-NEXT:    addi 4, 1, -32
-; BE-NEXT:    ldux 3, 4, 3
-; BE-NEXT:    ld 6, 8(4)
-; BE-NEXT:    ld 7, 24(4)
-; BE-NEXT:    ld 4, 16(4)
+; BE-NEXT:    ldux 3, 6, 3
+; BE-NEXT:    ld 4, 8(6)
+; BE-NEXT:    ld 7, 24(6)
+; BE-NEXT:    ld 6, 16(6)
 ; BE-NEXT:    std 3, 0(5)
-; BE-NEXT:    std 4, 16(5)
+; BE-NEXT:    std 6, 16(5)
 ; BE-NEXT:    std 7, 24(5)
-; BE-NEXT:    std 6, 8(5)
+; BE-NEXT:    std 4, 8(5)
 ; BE-NEXT:    blr
 ;
-; LE-32BIT-LABEL: ashr_32bytes:
+; LE-32BIT-LABEL: ashr_32bytes_dwordOff:
 ; LE-32BIT:       # %bb.0:
 ; LE-32BIT-NEXT:    stwu 1, -80(1)
 ; LE-32BIT-NEXT:    lwz 7, 0(3)
@@ -707,7 +1915,7 @@ define void @ashr_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ; LE-32BIT-NEXT:    lwz 4, 28(4)
 ; LE-32BIT-NEXT:    stw 3, 76(1)
 ; LE-32BIT-NEXT:    srawi 3, 7, 31
-; LE-32BIT-NEXT:    clrlwi 4, 4, 27
+; LE-32BIT-NEXT:    rlwinm 4, 4, 3, 27, 28
 ; LE-32BIT-NEXT:    stw 0, 72(1)
 ; LE-32BIT-NEXT:    stw 12, 68(1)
 ; LE-32BIT-NEXT:    stw 11, 64(1)
@@ -743,11 +1951,13 @@ define void @ashr_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ; LE-32BIT-NEXT:    addi 1, 1, 80
 ; LE-32BIT-NEXT:    blr
   %src = load i256, ptr %src.ptr, align 1
-  %byteOff = load i256, ptr %byteOff.ptr, align 1
-  %bitOff = shl i256 %byteOff, 3
+  %dwordOff = load i256, ptr %dwordOff.ptr, align 1
+  %bitOff = shl i256 %dwordOff, 6
   %res = ashr i256 %src, %bitOff
   store i256 %res, ptr %dst, align 1
   ret void
 }
+
+
 ;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
 ; LE: {{.*}}

diff  --git a/llvm/test/CodeGen/PowerPC/wide-scalar-shift-legalization.ll b/llvm/test/CodeGen/PowerPC/wide-scalar-shift-legalization.ll
index 044ddf562294c8..8e69547df6fcc1 100644
--- a/llvm/test/CodeGen/PowerPC/wide-scalar-shift-legalization.ll
+++ b/llvm/test/CodeGen/PowerPC/wide-scalar-shift-legalization.ll
@@ -209,45 +209,41 @@ define void @lshr_16bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; LE-32BIT-NEXT:    stwu 1, -48(1)
 ; LE-32BIT-NEXT:    lwz 7, 0(3)
 ; LE-32BIT-NEXT:    li 6, 0
-; LE-32BIT-NEXT:    lwz 4, 12(4)
 ; LE-32BIT-NEXT:    lwz 8, 4(3)
 ; LE-32BIT-NEXT:    lwz 9, 8(3)
 ; LE-32BIT-NEXT:    lwz 3, 12(3)
+; LE-32BIT-NEXT:    lwz 4, 12(4)
 ; LE-32BIT-NEXT:    stw 6, 28(1)
 ; LE-32BIT-NEXT:    stw 6, 24(1)
 ; LE-32BIT-NEXT:    stw 6, 20(1)
 ; LE-32BIT-NEXT:    stw 6, 16(1)
-; LE-32BIT-NEXT:    addi 6, 1, 32
-; LE-32BIT-NEXT:    stw 7, 32(1)
-; LE-32BIT-NEXT:    rlwinm 7, 4, 29, 28, 31
+; LE-32BIT-NEXT:    rlwinm 6, 4, 29, 28, 29
 ; LE-32BIT-NEXT:    stw 3, 44(1)
-; LE-32BIT-NEXT:    sub 6, 6, 7
+; LE-32BIT-NEXT:    addi 3, 1, 32
 ; LE-32BIT-NEXT:    stw 9, 40(1)
-; LE-32BIT-NEXT:    li 3, 7
+; LE-32BIT-NEXT:    sub 3, 3, 6
 ; LE-32BIT-NEXT:    stw 8, 36(1)
-; LE-32BIT-NEXT:    nand 3, 4, 3
-; LE-32BIT-NEXT:    lwz 7, 4(6)
-; LE-32BIT-NEXT:    clrlwi 4, 4, 29
-; LE-32BIT-NEXT:    lwz 8, 8(6)
-; LE-32BIT-NEXT:    subfic 10, 4, 32
-; LE-32BIT-NEXT:    lwz 9, 0(6)
-; LE-32BIT-NEXT:    clrlwi 3, 3, 27
-; LE-32BIT-NEXT:    lwz 6, 12(6)
-; LE-32BIT-NEXT:    srw 11, 8, 4
-; LE-32BIT-NEXT:    slw 8, 8, 10
-; LE-32BIT-NEXT:    slw 10, 9, 10
-; LE-32BIT-NEXT:    srw 6, 6, 4
-; LE-32BIT-NEXT:    srw 9, 9, 4
-; LE-32BIT-NEXT:    srw 4, 7, 4
-; LE-32BIT-NEXT:    slwi 7, 7, 1
-; LE-32BIT-NEXT:    slw 3, 7, 3
-; LE-32BIT-NEXT:    or 6, 8, 6
-; LE-32BIT-NEXT:    or 4, 10, 4
-; LE-32BIT-NEXT:    or 3, 11, 3
-; LE-32BIT-NEXT:    stw 9, 0(5)
-; LE-32BIT-NEXT:    stw 6, 12(5)
-; LE-32BIT-NEXT:    stw 4, 4(5)
+; LE-32BIT-NEXT:    clrlwi 4, 4, 27
+; LE-32BIT-NEXT:    stw 7, 32(1)
+; LE-32BIT-NEXT:    subfic 9, 4, 32
+; LE-32BIT-NEXT:    lwz 6, 4(3)
+; LE-32BIT-NEXT:    lwz 7, 0(3)
+; LE-32BIT-NEXT:    lwz 8, 12(3)
+; LE-32BIT-NEXT:    srw 10, 6, 4
+; LE-32BIT-NEXT:    lwz 3, 8(3)
+; LE-32BIT-NEXT:    slw 11, 7, 9
+; LE-32BIT-NEXT:    slw 6, 6, 9
+; LE-32BIT-NEXT:    srw 8, 8, 4
+; LE-32BIT-NEXT:    slw 9, 3, 9
+; LE-32BIT-NEXT:    srw 3, 3, 4
+; LE-32BIT-NEXT:    or 3, 6, 3
 ; LE-32BIT-NEXT:    stw 3, 8(5)
+; LE-32BIT-NEXT:    or 3, 9, 8
+; LE-32BIT-NEXT:    srw 4, 7, 4
+; LE-32BIT-NEXT:    stw 3, 12(5)
+; LE-32BIT-NEXT:    or 3, 11, 10
+; LE-32BIT-NEXT:    stw 4, 0(5)
+; LE-32BIT-NEXT:    stw 3, 4(5)
 ; LE-32BIT-NEXT:    addi 1, 1, 48
 ; LE-32BIT-NEXT:    blr
   %src = load i128, ptr %src.ptr, align 1
@@ -304,34 +300,30 @@ define void @shl_16bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; LE-32BIT-NEXT:    stw 6, 40(1)
 ; LE-32BIT-NEXT:    stw 6, 36(1)
 ; LE-32BIT-NEXT:    stw 6, 32(1)
-; LE-32BIT-NEXT:    rlwinm 6, 4, 29, 28, 31
+; LE-32BIT-NEXT:    rlwinm 6, 4, 29, 28, 29
 ; LE-32BIT-NEXT:    stw 3, 28(1)
 ; LE-32BIT-NEXT:    addi 3, 1, 16
 ; LE-32BIT-NEXT:    stw 9, 24(1)
+; LE-32BIT-NEXT:    clrlwi 4, 4, 27
 ; LE-32BIT-NEXT:    stw 8, 20(1)
+; LE-32BIT-NEXT:    subfic 8, 4, 32
 ; LE-32BIT-NEXT:    stw 7, 16(1)
-; LE-32BIT-NEXT:    li 7, 7
 ; LE-32BIT-NEXT:    lwzux 3, 6, 3
-; LE-32BIT-NEXT:    nand 7, 4, 7
-; LE-32BIT-NEXT:    clrlwi 4, 4, 29
-; LE-32BIT-NEXT:    subfic 10, 4, 32
-; LE-32BIT-NEXT:    lwz 8, 8(6)
-; LE-32BIT-NEXT:    clrlwi 7, 7, 27
 ; LE-32BIT-NEXT:    lwz 9, 4(6)
 ; LE-32BIT-NEXT:    slw 3, 3, 4
+; LE-32BIT-NEXT:    lwz 7, 8(6)
 ; LE-32BIT-NEXT:    lwz 6, 12(6)
 ; LE-32BIT-NEXT:    slw 11, 9, 4
-; LE-32BIT-NEXT:    srw 9, 9, 10
-; LE-32BIT-NEXT:    srw 10, 6, 10
-; LE-32BIT-NEXT:    slw 6, 6, 4
-; LE-32BIT-NEXT:    slw 4, 8, 4
-; LE-32BIT-NEXT:    srwi 8, 8, 1
-; LE-32BIT-NEXT:    srw 7, 8, 7
+; LE-32BIT-NEXT:    srw 9, 9, 8
+; LE-32BIT-NEXT:    srw 10, 7, 8
+; LE-32BIT-NEXT:    srw 8, 6, 8
+; LE-32BIT-NEXT:    slw 7, 7, 4
+; LE-32BIT-NEXT:    slw 4, 6, 4
 ; LE-32BIT-NEXT:    or 3, 3, 9
-; LE-32BIT-NEXT:    or 4, 4, 10
+; LE-32BIT-NEXT:    stw 4, 12(5)
+; LE-32BIT-NEXT:    or 4, 7, 8
 ; LE-32BIT-NEXT:    stw 3, 0(5)
-; LE-32BIT-NEXT:    or 3, 11, 7
-; LE-32BIT-NEXT:    stw 6, 12(5)
+; LE-32BIT-NEXT:    or 3, 11, 10
 ; LE-32BIT-NEXT:    stw 4, 8(5)
 ; LE-32BIT-NEXT:    stw 3, 4(5)
 ; LE-32BIT-NEXT:    addi 1, 1, 48
@@ -387,46 +379,42 @@ define void @ashr_16bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; LE-32BIT:       # %bb.0:
 ; LE-32BIT-NEXT:    stwu 1, -48(1)
 ; LE-32BIT-NEXT:    lwz 7, 0(3)
-; LE-32BIT-NEXT:    li 6, 7
+; LE-32BIT-NEXT:    addi 6, 1, 32
 ; LE-32BIT-NEXT:    lwz 8, 4(3)
 ; LE-32BIT-NEXT:    lwz 9, 8(3)
 ; LE-32BIT-NEXT:    lwz 3, 12(3)
 ; LE-32BIT-NEXT:    lwz 4, 12(4)
 ; LE-32BIT-NEXT:    stw 3, 44(1)
 ; LE-32BIT-NEXT:    srawi 3, 7, 31
-; LE-32BIT-NEXT:    stw 8, 36(1)
-; LE-32BIT-NEXT:    rlwinm 8, 4, 29, 28, 31
 ; LE-32BIT-NEXT:    stw 7, 32(1)
-; LE-32BIT-NEXT:    addi 7, 1, 32
+; LE-32BIT-NEXT:    rlwinm 7, 4, 29, 28, 29
 ; LE-32BIT-NEXT:    stw 9, 40(1)
-; LE-32BIT-NEXT:    nand 6, 4, 6
+; LE-32BIT-NEXT:    clrlwi 4, 4, 27
+; LE-32BIT-NEXT:    stw 8, 36(1)
+; LE-32BIT-NEXT:    subfic 9, 4, 32
 ; LE-32BIT-NEXT:    stw 3, 28(1)
-; LE-32BIT-NEXT:    clrlwi 4, 4, 29
 ; LE-32BIT-NEXT:    stw 3, 24(1)
-; LE-32BIT-NEXT:    subfic 10, 4, 32
 ; LE-32BIT-NEXT:    stw 3, 20(1)
-; LE-32BIT-NEXT:    clrlwi 6, 6, 27
 ; LE-32BIT-NEXT:    stw 3, 16(1)
-; LE-32BIT-NEXT:    sub 3, 7, 8
-; LE-32BIT-NEXT:    lwz 7, 4(3)
-; LE-32BIT-NEXT:    lwz 8, 8(3)
-; LE-32BIT-NEXT:    lwz 9, 0(3)
-; LE-32BIT-NEXT:    lwz 3, 12(3)
-; LE-32BIT-NEXT:    srw 11, 8, 4
-; LE-32BIT-NEXT:    slw 8, 8, 10
-; LE-32BIT-NEXT:    slw 10, 9, 10
+; LE-32BIT-NEXT:    sub 3, 6, 7
+; LE-32BIT-NEXT:    lwz 6, 4(3)
+; LE-32BIT-NEXT:    lwz 7, 0(3)
+; LE-32BIT-NEXT:    lwz 8, 12(3)
+; LE-32BIT-NEXT:    srw 10, 6, 4
+; LE-32BIT-NEXT:    lwz 3, 8(3)
+; LE-32BIT-NEXT:    slw 11, 7, 9
+; LE-32BIT-NEXT:    slw 6, 6, 9
+; LE-32BIT-NEXT:    srw 8, 8, 4
+; LE-32BIT-NEXT:    slw 9, 3, 9
 ; LE-32BIT-NEXT:    srw 3, 3, 4
-; LE-32BIT-NEXT:    sraw 9, 9, 4
-; LE-32BIT-NEXT:    srw 4, 7, 4
-; LE-32BIT-NEXT:    slwi 7, 7, 1
-; LE-32BIT-NEXT:    or 3, 8, 3
-; LE-32BIT-NEXT:    slw 6, 7, 6
+; LE-32BIT-NEXT:    or 3, 6, 3
+; LE-32BIT-NEXT:    stw 3, 8(5)
+; LE-32BIT-NEXT:    or 3, 9, 8
+; LE-32BIT-NEXT:    sraw 4, 7, 4
 ; LE-32BIT-NEXT:    stw 3, 12(5)
-; LE-32BIT-NEXT:    or 3, 10, 4
+; LE-32BIT-NEXT:    or 3, 11, 10
+; LE-32BIT-NEXT:    stw 4, 0(5)
 ; LE-32BIT-NEXT:    stw 3, 4(5)
-; LE-32BIT-NEXT:    or 3, 11, 6
-; LE-32BIT-NEXT:    stw 9, 0(5)
-; LE-32BIT-NEXT:    stw 3, 8(5)
 ; LE-32BIT-NEXT:    addi 1, 1, 48
 ; LE-32BIT-NEXT:    blr
   %src = load i128, ptr %src.ptr, align 1
@@ -449,32 +437,30 @@ define void @lshr_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; LE-64BIT-NEXT:    li 4, 48
 ; LE-64BIT-NEXT:    stxvd2x 2, 7, 4
 ; LE-64BIT-NEXT:    stxvd2x 2, 7, 8
-; LE-64BIT-NEXT:    rlwinm 4, 3, 29, 27, 31
+; LE-64BIT-NEXT:    rlwinm 4, 3, 29, 27, 28
+; LE-64BIT-NEXT:    clrlwi 3, 3, 26
 ; LE-64BIT-NEXT:    stxvd2x 0, 7, 6
 ; LE-64BIT-NEXT:    stxvd2x 1, 0, 7
-; LE-64BIT-NEXT:    li 6, 7
-; LE-64BIT-NEXT:    ldux 7, 4, 7
-; LE-64BIT-NEXT:    ld 8, 16(4)
-; LE-64BIT-NEXT:    nand 6, 3, 6
+; LE-64BIT-NEXT:    xori 8, 3, 63
+; LE-64BIT-NEXT:    ldux 6, 4, 7
+; LE-64BIT-NEXT:    ld 7, 16(4)
 ; LE-64BIT-NEXT:    ld 9, 8(4)
-; LE-64BIT-NEXT:    clrlwi 3, 3, 29
 ; LE-64BIT-NEXT:    ld 4, 24(4)
-; LE-64BIT-NEXT:    clrlwi 6, 6, 26
+; LE-64BIT-NEXT:    srd 6, 6, 3
+; LE-64BIT-NEXT:    sldi 11, 7, 1
+; LE-64BIT-NEXT:    srd 10, 9, 3
 ; LE-64BIT-NEXT:    srd 7, 7, 3
-; LE-64BIT-NEXT:    sldi 10, 8, 1
-; LE-64BIT-NEXT:    srd 11, 9, 3
-; LE-64BIT-NEXT:    srd 8, 8, 3
-; LE-64BIT-NEXT:    sld 6, 10, 6
+; LE-64BIT-NEXT:    sld 8, 11, 8
+; LE-64BIT-NEXT:    or 8, 10, 8
 ; LE-64BIT-NEXT:    subfic 10, 3, 64
 ; LE-64BIT-NEXT:    srd 3, 4, 3
-; LE-64BIT-NEXT:    or 6, 11, 6
 ; LE-64BIT-NEXT:    sld 11, 4, 10
 ; LE-64BIT-NEXT:    sld 9, 9, 10
 ; LE-64BIT-NEXT:    std 3, 24(5)
-; LE-64BIT-NEXT:    or 7, 9, 7
-; LE-64BIT-NEXT:    or 3, 11, 8
-; LE-64BIT-NEXT:    std 6, 8(5)
-; LE-64BIT-NEXT:    std 7, 0(5)
+; LE-64BIT-NEXT:    std 8, 8(5)
+; LE-64BIT-NEXT:    or 6, 9, 6
+; LE-64BIT-NEXT:    or 3, 11, 7
+; LE-64BIT-NEXT:    std 6, 0(5)
 ; LE-64BIT-NEXT:    std 3, 16(5)
 ; LE-64BIT-NEXT:    blr
 ;
@@ -485,44 +471,39 @@ define void @lshr_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; BE-NEXT:    ld 8, 16(3)
 ; BE-NEXT:    ld 3, 24(3)
 ; BE-NEXT:    lwz 4, 28(4)
-; BE-NEXT:    addi 9, 1, -64
-; BE-NEXT:    li 10, 0
-; BE-NEXT:    addi 11, 1, -32
-; BE-NEXT:    std 3, 56(9)
-; BE-NEXT:    rlwinm 3, 4, 29, 27, 31
+; BE-NEXT:    li 9, 0
+; BE-NEXT:    addi 10, 1, -32
+; BE-NEXT:    std 9, -40(1)
+; BE-NEXT:    std 9, -48(1)
+; BE-NEXT:    std 9, -56(1)
+; BE-NEXT:    std 9, -64(1)
+; BE-NEXT:    std 3, -8(1)
+; BE-NEXT:    rlwinm 3, 4, 29, 27, 28
 ; BE-NEXT:    neg 3, 3
-; BE-NEXT:    std 10, 24(9)
-; BE-NEXT:    std 10, 16(9)
-; BE-NEXT:    std 10, 8(9)
-; BE-NEXT:    std 10, -64(1)
-; BE-NEXT:    std 8, 48(9)
-; BE-NEXT:    std 7, 40(9)
-; BE-NEXT:    std 6, 32(9)
+; BE-NEXT:    std 8, -16(1)
+; BE-NEXT:    std 7, -24(1)
+; BE-NEXT:    std 6, -32(1)
 ; BE-NEXT:    extsw 3, 3
-; BE-NEXT:    ldux 3, 11, 3
-; BE-NEXT:    li 6, 7
-; BE-NEXT:    nand 6, 4, 6
-; BE-NEXT:    clrlwi 4, 4, 29
-; BE-NEXT:    clrlwi 6, 6, 26
-; BE-NEXT:    ld 7, 8(11)
-; BE-NEXT:    ld 8, 16(11)
-; BE-NEXT:    ld 9, 24(11)
-; BE-NEXT:    subfic 10, 4, 64
-; BE-NEXT:    sldi 11, 7, 1
-; BE-NEXT:    srd 7, 7, 4
-; BE-NEXT:    srd 9, 9, 4
-; BE-NEXT:    sld 6, 11, 6
-; BE-NEXT:    sld 11, 3, 10
-; BE-NEXT:    sld 10, 8, 10
-; BE-NEXT:    srd 8, 8, 4
+; BE-NEXT:    ldux 3, 10, 3
+; BE-NEXT:    clrlwi 4, 4, 26
+; BE-NEXT:    subfic 9, 4, 64
+; BE-NEXT:    ld 6, 8(10)
+; BE-NEXT:    ld 7, 24(10)
+; BE-NEXT:    ld 8, 16(10)
+; BE-NEXT:    sld 10, 3, 9
 ; BE-NEXT:    srd 3, 3, 4
-; BE-NEXT:    or 7, 11, 7
-; BE-NEXT:    or 6, 8, 6
-; BE-NEXT:    or 8, 10, 9
 ; BE-NEXT:    std 3, 0(5)
-; BE-NEXT:    std 8, 24(5)
-; BE-NEXT:    std 7, 8(5)
+; BE-NEXT:    srd 11, 6, 4
+; BE-NEXT:    srd 7, 7, 4
+; BE-NEXT:    sld 6, 6, 9
+; BE-NEXT:    sld 9, 8, 9
+; BE-NEXT:    srd 8, 8, 4
+; BE-NEXT:    or 10, 10, 11
+; BE-NEXT:    or 7, 9, 7
+; BE-NEXT:    or 6, 6, 8
 ; BE-NEXT:    std 6, 16(5)
+; BE-NEXT:    std 7, 24(5)
+; BE-NEXT:    std 10, 8(5)
 ; BE-NEXT:    blr
 ;
 ; LE-32BIT-LABEL: lshr_32bytes:
@@ -538,7 +519,6 @@ define void @lshr_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; LE-32BIT-NEXT:    lwz 0, 24(3)
 ; LE-32BIT-NEXT:    lwz 3, 28(3)
 ; LE-32BIT-NEXT:    lwz 4, 28(4)
-; LE-32BIT-NEXT:    stw 6, 48(1)
 ; LE-32BIT-NEXT:    stw 6, 44(1)
 ; LE-32BIT-NEXT:    stw 6, 40(1)
 ; LE-32BIT-NEXT:    stw 6, 36(1)
@@ -546,68 +526,65 @@ define void @lshr_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; LE-32BIT-NEXT:    stw 6, 28(1)
 ; LE-32BIT-NEXT:    stw 6, 24(1)
 ; LE-32BIT-NEXT:    stw 6, 20(1)
-; LE-32BIT-NEXT:    rlwinm 6, 4, 29, 27, 31
-; LE-32BIT-NEXT:    stw 3, 80(1)
-; LE-32BIT-NEXT:    addi 3, 1, 52
+; LE-32BIT-NEXT:    stw 6, 16(1)
+; LE-32BIT-NEXT:    rlwinm 6, 4, 29, 27, 29
+; LE-32BIT-NEXT:    stw 3, 76(1)
+; LE-32BIT-NEXT:    addi 3, 1, 48
 ; LE-32BIT-NEXT:    stw 25, 84(1) # 4-byte Folded Spill
 ; LE-32BIT-NEXT:    sub 3, 3, 6
 ; LE-32BIT-NEXT:    stw 26, 88(1) # 4-byte Folded Spill
+; LE-32BIT-NEXT:    clrlwi 4, 4, 27
 ; LE-32BIT-NEXT:    stw 27, 92(1) # 4-byte Folded Spill
 ; LE-32BIT-NEXT:    stw 28, 96(1) # 4-byte Folded Spill
 ; LE-32BIT-NEXT:    stw 29, 100(1) # 4-byte Folded Spill
 ; LE-32BIT-NEXT:    stw 30, 104(1) # 4-byte Folded Spill
-; LE-32BIT-NEXT:    stw 0, 76(1)
-; LE-32BIT-NEXT:    stw 12, 72(1)
-; LE-32BIT-NEXT:    stw 11, 68(1)
-; LE-32BIT-NEXT:    stw 10, 64(1)
-; LE-32BIT-NEXT:    stw 9, 60(1)
-; LE-32BIT-NEXT:    li 9, 7
-; LE-32BIT-NEXT:    stw 8, 56(1)
-; LE-32BIT-NEXT:    nand 9, 4, 9
-; LE-32BIT-NEXT:    stw 7, 52(1)
-; LE-32BIT-NEXT:    clrlwi 4, 4, 29
-; LE-32BIT-NEXT:    lwz 6, 4(3)
 ; LE-32BIT-NEXT:    subfic 30, 4, 32
-; LE-32BIT-NEXT:    lwz 7, 8(3)
-; LE-32BIT-NEXT:    clrlwi 9, 9, 27
-; LE-32BIT-NEXT:    lwz 8, 12(3)
-; LE-32BIT-NEXT:    slwi 29, 6, 1
-; LE-32BIT-NEXT:    lwz 10, 16(3)
-; LE-32BIT-NEXT:    srw 28, 7, 4
-; LE-32BIT-NEXT:    lwz 11, 20(3)
-; LE-32BIT-NEXT:    slwi 27, 8, 1
-; LE-32BIT-NEXT:    lwz 12, 24(3)
+; LE-32BIT-NEXT:    stw 0, 72(1)
+; LE-32BIT-NEXT:    stw 12, 68(1)
+; LE-32BIT-NEXT:    xori 12, 4, 31
+; LE-32BIT-NEXT:    stw 11, 64(1)
+; LE-32BIT-NEXT:    stw 10, 60(1)
+; LE-32BIT-NEXT:    stw 9, 56(1)
+; LE-32BIT-NEXT:    stw 8, 52(1)
+; LE-32BIT-NEXT:    stw 7, 48(1)
+; LE-32BIT-NEXT:    lwz 6, 8(3)
+; LE-32BIT-NEXT:    lwz 7, 4(3)
+; LE-32BIT-NEXT:    lwz 8, 0(3)
+; LE-32BIT-NEXT:    srw 29, 6, 4
+; LE-32BIT-NEXT:    lwz 9, 12(3)
+; LE-32BIT-NEXT:    slw 6, 6, 30
+; LE-32BIT-NEXT:    lwz 10, 20(3)
+; LE-32BIT-NEXT:    slw 28, 8, 30
+; LE-32BIT-NEXT:    lwz 11, 16(3)
+; LE-32BIT-NEXT:    srw 27, 9, 4
+; LE-32BIT-NEXT:    lwz 0, 28(3)
 ; LE-32BIT-NEXT:    srw 26, 10, 4
-; LE-32BIT-NEXT:    lwz 0, 0(3)
-; LE-32BIT-NEXT:    srw 6, 6, 4
-; LE-32BIT-NEXT:    lwz 3, 28(3)
-; LE-32BIT-NEXT:    srw 25, 12, 4
-; LE-32BIT-NEXT:    slw 12, 12, 30
-; LE-32BIT-NEXT:    slw 7, 7, 30
-; LE-32BIT-NEXT:    srw 3, 3, 4
+; LE-32BIT-NEXT:    lwz 3, 24(3)
+; LE-32BIT-NEXT:    slw 25, 11, 30
+; LE-32BIT-NEXT:    slw 9, 9, 30
 ; LE-32BIT-NEXT:    slw 10, 10, 30
-; LE-32BIT-NEXT:    slw 30, 0, 30
-; LE-32BIT-NEXT:    srw 8, 8, 4
+; LE-32BIT-NEXT:    slw 30, 3, 30
+; LE-32BIT-NEXT:    srw 3, 3, 4
 ; LE-32BIT-NEXT:    srw 0, 0, 4
-; LE-32BIT-NEXT:    srw 4, 11, 4
-; LE-32BIT-NEXT:    or 3, 12, 3
+; LE-32BIT-NEXT:    or 3, 10, 3
+; LE-32BIT-NEXT:    srw 11, 11, 4
+; LE-32BIT-NEXT:    stw 3, 24(5)
+; LE-32BIT-NEXT:    or 3, 30, 0
 ; LE-32BIT-NEXT:    stw 3, 28(5)
-; LE-32BIT-NEXT:    or 3, 10, 4
-; LE-32BIT-NEXT:    slwi 11, 11, 1
+; LE-32BIT-NEXT:    or 3, 9, 11
+; LE-32BIT-NEXT:    stw 3, 16(5)
+; LE-32BIT-NEXT:    or 3, 25, 26
+; LE-32BIT-NEXT:    srw 8, 8, 4
+; LE-32BIT-NEXT:    srw 4, 7, 4
+; LE-32BIT-NEXT:    slwi 7, 7, 1
 ; LE-32BIT-NEXT:    stw 3, 20(5)
-; LE-32BIT-NEXT:    or 3, 7, 8
-; LE-32BIT-NEXT:    slw 29, 29, 9
-; LE-32BIT-NEXT:    slw 27, 27, 9
-; LE-32BIT-NEXT:    slw 9, 11, 9
+; LE-32BIT-NEXT:    or 3, 6, 27
+; LE-32BIT-NEXT:    slw 7, 7, 12
 ; LE-32BIT-NEXT:    stw 3, 12(5)
-; LE-32BIT-NEXT:    or 3, 30, 6
+; LE-32BIT-NEXT:    or 3, 28, 4
 ; LE-32BIT-NEXT:    stw 3, 4(5)
-; LE-32BIT-NEXT:    or 3, 25, 9
-; LE-32BIT-NEXT:    stw 3, 24(5)
-; LE-32BIT-NEXT:    or 3, 26, 27
-; LE-32BIT-NEXT:    stw 3, 16(5)
-; LE-32BIT-NEXT:    or 3, 28, 29
-; LE-32BIT-NEXT:    stw 0, 0(5)
+; LE-32BIT-NEXT:    or 3, 29, 7
+; LE-32BIT-NEXT:    stw 8, 0(5)
 ; LE-32BIT-NEXT:    stw 3, 8(5)
 ; LE-32BIT-NEXT:    lwz 30, 104(1) # 4-byte Folded Reload
 ; LE-32BIT-NEXT:    lwz 29, 100(1) # 4-byte Folded Reload
@@ -635,37 +612,33 @@ define void @shl_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; LE-64BIT-NEXT:    lxvd2x 0, 3, 6
 ; LE-64BIT-NEXT:    stxvd2x 2, 7, 6
 ; LE-64BIT-NEXT:    li 6, 48
-; LE-64BIT-NEXT:    rlwinm 3, 4, 29, 27, 31
+; LE-64BIT-NEXT:    rlwinm 3, 4, 29, 27, 28
+; LE-64BIT-NEXT:    clrlwi 4, 4, 26
 ; LE-64BIT-NEXT:    neg 3, 3
 ; LE-64BIT-NEXT:    stxvd2x 0, 7, 6
 ; LE-64BIT-NEXT:    li 6, 32
 ; LE-64BIT-NEXT:    extsw 3, 3
 ; LE-64BIT-NEXT:    stxvd2x 1, 7, 6
 ; LE-64BIT-NEXT:    stxvd2x 2, 0, 7
-; LE-64BIT-NEXT:    li 6, 7
+; LE-64BIT-NEXT:    subfic 6, 4, 64
 ; LE-64BIT-NEXT:    ldux 3, 8, 3
-; LE-64BIT-NEXT:    ld 7, 8(8)
-; LE-64BIT-NEXT:    nand 6, 4, 6
-; LE-64BIT-NEXT:    ld 9, 16(8)
-; LE-64BIT-NEXT:    clrlwi 4, 4, 29
-; LE-64BIT-NEXT:    ld 8, 24(8)
-; LE-64BIT-NEXT:    clrlwi 6, 6, 26
-; LE-64BIT-NEXT:    rldicl 10, 7, 63, 1
-; LE-64BIT-NEXT:    sld 8, 8, 4
+; LE-64BIT-NEXT:    ld 7, 16(8)
+; LE-64BIT-NEXT:    ld 9, 24(8)
+; LE-64BIT-NEXT:    ld 8, 8(8)
+; LE-64BIT-NEXT:    srd 10, 7, 6
+; LE-64BIT-NEXT:    sld 9, 9, 4
 ; LE-64BIT-NEXT:    sld 7, 7, 4
-; LE-64BIT-NEXT:    srd 6, 10, 6
-; LE-64BIT-NEXT:    sld 10, 9, 4
-; LE-64BIT-NEXT:    or 6, 10, 6
-; LE-64BIT-NEXT:    subfic 10, 4, 64
-; LE-64BIT-NEXT:    srd 9, 9, 10
-; LE-64BIT-NEXT:    srd 10, 3, 10
+; LE-64BIT-NEXT:    or 9, 9, 10
+; LE-64BIT-NEXT:    srd 10, 8, 6
+; LE-64BIT-NEXT:    srd 6, 3, 6
+; LE-64BIT-NEXT:    sld 8, 8, 4
 ; LE-64BIT-NEXT:    sld 3, 3, 4
-; LE-64BIT-NEXT:    std 6, 16(5)
-; LE-64BIT-NEXT:    or 7, 7, 10
+; LE-64BIT-NEXT:    or 6, 8, 6
 ; LE-64BIT-NEXT:    std 3, 0(5)
-; LE-64BIT-NEXT:    or 3, 8, 9
-; LE-64BIT-NEXT:    std 7, 8(5)
-; LE-64BIT-NEXT:    std 3, 24(5)
+; LE-64BIT-NEXT:    or 3, 7, 10
+; LE-64BIT-NEXT:    std 9, 24(5)
+; LE-64BIT-NEXT:    std 6, 8(5)
+; LE-64BIT-NEXT:    std 3, 16(5)
 ; LE-64BIT-NEXT:    blr
 ;
 ; BE-LABEL: shl_32bytes:
@@ -675,41 +648,37 @@ define void @shl_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; BE-NEXT:    ld 8, 16(3)
 ; BE-NEXT:    ld 3, 24(3)
 ; BE-NEXT:    lwz 4, 28(4)
-; BE-NEXT:    addi 9, 1, -64
-; BE-NEXT:    li 10, 0
-; BE-NEXT:    std 10, 56(9)
-; BE-NEXT:    std 10, 48(9)
-; BE-NEXT:    std 10, 40(9)
-; BE-NEXT:    std 10, 32(9)
-; BE-NEXT:    std 3, 24(9)
-; BE-NEXT:    std 8, 16(9)
-; BE-NEXT:    std 7, 8(9)
+; BE-NEXT:    li 9, 0
+; BE-NEXT:    addi 10, 1, -64
+; BE-NEXT:    std 9, -8(1)
+; BE-NEXT:    std 9, -16(1)
+; BE-NEXT:    std 9, -24(1)
+; BE-NEXT:    std 9, -32(1)
+; BE-NEXT:    std 3, -40(1)
+; BE-NEXT:    std 8, -48(1)
+; BE-NEXT:    std 7, -56(1)
 ; BE-NEXT:    std 6, -64(1)
-; BE-NEXT:    rlwinm 3, 4, 29, 27, 31
-; BE-NEXT:    ldux 6, 3, 9
-; BE-NEXT:    li 7, 7
-; BE-NEXT:    nand 7, 4, 7
-; BE-NEXT:    clrlwi 4, 4, 29
-; BE-NEXT:    clrlwi 7, 7, 26
-; BE-NEXT:    ld 8, 16(3)
-; BE-NEXT:    ld 9, 8(3)
+; BE-NEXT:    rlwinm 3, 4, 29, 27, 28
+; BE-NEXT:    ldux 6, 3, 10
+; BE-NEXT:    clrlwi 4, 4, 26
+; BE-NEXT:    subfic 9, 4, 64
+; BE-NEXT:    ld 7, 16(3)
+; BE-NEXT:    ld 8, 8(3)
 ; BE-NEXT:    ld 3, 24(3)
-; BE-NEXT:    subfic 10, 4, 64
 ; BE-NEXT:    sld 6, 6, 4
-; BE-NEXT:    rldicl 11, 8, 63, 1
-; BE-NEXT:    sld 8, 8, 4
-; BE-NEXT:    srd 7, 11, 7
-; BE-NEXT:    srd 11, 9, 10
-; BE-NEXT:    sld 9, 9, 4
-; BE-NEXT:    srd 10, 3, 10
+; BE-NEXT:    srd 10, 7, 9
+; BE-NEXT:    sld 11, 8, 4
+; BE-NEXT:    srd 8, 8, 9
+; BE-NEXT:    srd 9, 3, 9
+; BE-NEXT:    sld 7, 7, 4
 ; BE-NEXT:    sld 3, 3, 4
-; BE-NEXT:    or 6, 6, 11
-; BE-NEXT:    or 7, 9, 7
-; BE-NEXT:    or 8, 8, 10
+; BE-NEXT:    or 10, 11, 10
+; BE-NEXT:    or 6, 6, 8
+; BE-NEXT:    or 7, 7, 9
 ; BE-NEXT:    std 3, 24(5)
-; BE-NEXT:    std 8, 16(5)
+; BE-NEXT:    std 7, 16(5)
 ; BE-NEXT:    std 6, 0(5)
-; BE-NEXT:    std 7, 8(5)
+; BE-NEXT:    std 10, 8(5)
 ; BE-NEXT:    blr
 ;
 ; LE-32BIT-LABEL: shl_32bytes:
@@ -731,7 +700,6 @@ define void @shl_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; LE-32BIT-NEXT:    stw 28, 96(1) # 4-byte Folded Spill
 ; LE-32BIT-NEXT:    stw 29, 100(1) # 4-byte Folded Spill
 ; LE-32BIT-NEXT:    stw 30, 104(1) # 4-byte Folded Spill
-; LE-32BIT-NEXT:    stw 6, 80(1)
 ; LE-32BIT-NEXT:    stw 6, 76(1)
 ; LE-32BIT-NEXT:    stw 6, 72(1)
 ; LE-32BIT-NEXT:    stw 6, 68(1)
@@ -739,61 +707,56 @@ define void @shl_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; LE-32BIT-NEXT:    stw 6, 60(1)
 ; LE-32BIT-NEXT:    stw 6, 56(1)
 ; LE-32BIT-NEXT:    stw 6, 52(1)
-; LE-32BIT-NEXT:    rlwinm 6, 4, 29, 27, 31
-; LE-32BIT-NEXT:    stw 3, 48(1)
-; LE-32BIT-NEXT:    addi 3, 1, 20
-; LE-32BIT-NEXT:    stw 0, 44(1)
-; LE-32BIT-NEXT:    stw 12, 40(1)
-; LE-32BIT-NEXT:    stw 11, 36(1)
-; LE-32BIT-NEXT:    stw 10, 32(1)
-; LE-32BIT-NEXT:    stw 9, 28(1)
-; LE-32BIT-NEXT:    stw 8, 24(1)
-; LE-32BIT-NEXT:    li 8, 7
-; LE-32BIT-NEXT:    stw 7, 20(1)
-; LE-32BIT-NEXT:    nand 8, 4, 8
+; LE-32BIT-NEXT:    stw 6, 48(1)
+; LE-32BIT-NEXT:    rlwinm 6, 4, 29, 27, 29
+; LE-32BIT-NEXT:    stw 3, 44(1)
+; LE-32BIT-NEXT:    addi 3, 1, 16
+; LE-32BIT-NEXT:    stw 0, 40(1)
+; LE-32BIT-NEXT:    clrlwi 4, 4, 27
+; LE-32BIT-NEXT:    stw 12, 36(1)
+; LE-32BIT-NEXT:    subfic 12, 4, 32
+; LE-32BIT-NEXT:    stw 11, 32(1)
+; LE-32BIT-NEXT:    stw 10, 28(1)
+; LE-32BIT-NEXT:    stw 9, 24(1)
+; LE-32BIT-NEXT:    stw 8, 20(1)
+; LE-32BIT-NEXT:    stw 7, 16(1)
 ; LE-32BIT-NEXT:    lwzux 3, 6, 3
-; LE-32BIT-NEXT:    clrlwi 4, 4, 29
-; LE-32BIT-NEXT:    subfic 0, 4, 32
-; LE-32BIT-NEXT:    clrlwi 8, 8, 27
 ; LE-32BIT-NEXT:    lwz 7, 8(6)
 ; LE-32BIT-NEXT:    slw 3, 3, 4
-; LE-32BIT-NEXT:    lwz 9, 4(6)
-; LE-32BIT-NEXT:    lwz 10, 16(6)
-; LE-32BIT-NEXT:    srwi 29, 7, 1
-; LE-32BIT-NEXT:    lwz 11, 12(6)
-; LE-32BIT-NEXT:    slw 28, 9, 4
-; LE-32BIT-NEXT:    lwz 12, 24(6)
-; LE-32BIT-NEXT:    srwi 27, 10, 1
-; LE-32BIT-NEXT:    lwz 30, 20(6)
-; LE-32BIT-NEXT:    slw 26, 11, 4
+; LE-32BIT-NEXT:    lwz 8, 4(6)
+; LE-32BIT-NEXT:    lwz 9, 16(6)
+; LE-32BIT-NEXT:    srw 30, 7, 12
+; LE-32BIT-NEXT:    lwz 10, 12(6)
+; LE-32BIT-NEXT:    slw 29, 8, 4
+; LE-32BIT-NEXT:    lwz 11, 24(6)
+; LE-32BIT-NEXT:    srw 8, 8, 12
+; LE-32BIT-NEXT:    lwz 0, 20(6)
+; LE-32BIT-NEXT:    srw 28, 9, 12
 ; LE-32BIT-NEXT:    lwz 6, 28(6)
-; LE-32BIT-NEXT:    srw 9, 9, 0
-; LE-32BIT-NEXT:    slw 25, 30, 4
-; LE-32BIT-NEXT:    srw 11, 11, 0
+; LE-32BIT-NEXT:    slw 27, 10, 4
+; LE-32BIT-NEXT:    srw 10, 10, 12
 ; LE-32BIT-NEXT:    slw 7, 7, 4
-; LE-32BIT-NEXT:    srw 30, 30, 0
-; LE-32BIT-NEXT:    slw 10, 10, 4
-; LE-32BIT-NEXT:    srw 0, 6, 0
-; LE-32BIT-NEXT:    slw 6, 6, 4
-; LE-32BIT-NEXT:    slw 4, 12, 4
-; LE-32BIT-NEXT:    srwi 12, 12, 1
-; LE-32BIT-NEXT:    srw 29, 29, 8
-; LE-32BIT-NEXT:    srw 27, 27, 8
-; LE-32BIT-NEXT:    srw 8, 12, 8
-; LE-32BIT-NEXT:    or 3, 3, 9
-; LE-32BIT-NEXT:    or 4, 4, 0
-; LE-32BIT-NEXT:    stw 3, 0(5)
-; LE-32BIT-NEXT:    or 3, 25, 8
+; LE-32BIT-NEXT:    srw 26, 11, 12
+; LE-32BIT-NEXT:    slw 25, 0, 4
+; LE-32BIT-NEXT:    srw 0, 0, 12
+; LE-32BIT-NEXT:    slw 9, 9, 4
+; LE-32BIT-NEXT:    srw 12, 6, 12
+; LE-32BIT-NEXT:    slw 11, 11, 4
+; LE-32BIT-NEXT:    slw 4, 6, 4
+; LE-32BIT-NEXT:    stw 4, 28(5)
+; LE-32BIT-NEXT:    or 4, 11, 12
 ; LE-32BIT-NEXT:    stw 4, 24(5)
-; LE-32BIT-NEXT:    or 4, 10, 30
-; LE-32BIT-NEXT:    stw 3, 20(5)
-; LE-32BIT-NEXT:    or 3, 26, 27
+; LE-32BIT-NEXT:    or 4, 9, 0
 ; LE-32BIT-NEXT:    stw 4, 16(5)
-; LE-32BIT-NEXT:    or 4, 7, 11
-; LE-32BIT-NEXT:    stw 3, 12(5)
-; LE-32BIT-NEXT:    or 3, 28, 29
-; LE-32BIT-NEXT:    stw 6, 28(5)
+; LE-32BIT-NEXT:    or 4, 25, 26
+; LE-32BIT-NEXT:    stw 4, 20(5)
+; LE-32BIT-NEXT:    or 4, 7, 10
+; LE-32BIT-NEXT:    or 3, 3, 8
 ; LE-32BIT-NEXT:    stw 4, 8(5)
+; LE-32BIT-NEXT:    or 4, 27, 28
+; LE-32BIT-NEXT:    stw 3, 0(5)
+; LE-32BIT-NEXT:    or 3, 29, 30
+; LE-32BIT-NEXT:    stw 4, 12(5)
 ; LE-32BIT-NEXT:    stw 3, 4(5)
 ; LE-32BIT-NEXT:    lwz 30, 104(1) # 4-byte Folded Reload
 ; LE-32BIT-NEXT:    lwz 29, 100(1) # 4-byte Folded Reload
@@ -812,98 +775,91 @@ define void @shl_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 define void @ashr_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; LE-64BIT-LABEL: ashr_32bytes:
 ; LE-64BIT:       # %bb.0:
-; LE-64BIT-NEXT:    lxvd2x 0, 0, 3
 ; LE-64BIT-NEXT:    ld 6, 24(3)
+; LE-64BIT-NEXT:    lxvd2x 0, 0, 3
 ; LE-64BIT-NEXT:    lwz 4, 0(4)
 ; LE-64BIT-NEXT:    addi 7, 1, -64
 ; LE-64BIT-NEXT:    ld 3, 16(3)
 ; LE-64BIT-NEXT:    sradi 8, 6, 63
-; LE-64BIT-NEXT:    rlwinm 9, 4, 29, 27, 31
-; LE-64BIT-NEXT:    std 6, 24(7)
-; LE-64BIT-NEXT:    std 3, 16(7)
-; LE-64BIT-NEXT:    li 3, 7
-; LE-64BIT-NEXT:    std 8, 56(7)
-; LE-64BIT-NEXT:    std 8, 48(7)
-; LE-64BIT-NEXT:    std 8, 40(7)
-; LE-64BIT-NEXT:    std 8, 32(7)
+; LE-64BIT-NEXT:    rlwinm 9, 4, 29, 27, 28
+; LE-64BIT-NEXT:    clrlwi 4, 4, 26
 ; LE-64BIT-NEXT:    stxvd2x 0, 0, 7
-; LE-64BIT-NEXT:    nand 3, 4, 3
-; LE-64BIT-NEXT:    clrlwi 4, 4, 29
-; LE-64BIT-NEXT:    ldux 6, 9, 7
-; LE-64BIT-NEXT:    ld 7, 16(9)
+; LE-64BIT-NEXT:    std 6, -40(1)
+; LE-64BIT-NEXT:    std 3, -48(1)
+; LE-64BIT-NEXT:    std 8, -8(1)
+; LE-64BIT-NEXT:    std 8, -16(1)
+; LE-64BIT-NEXT:    std 8, -24(1)
+; LE-64BIT-NEXT:    std 8, -32(1)
+; LE-64BIT-NEXT:    ldux 3, 9, 7
+; LE-64BIT-NEXT:    xori 7, 4, 63
+; LE-64BIT-NEXT:    ld 6, 16(9)
 ; LE-64BIT-NEXT:    ld 8, 8(9)
-; LE-64BIT-NEXT:    clrlwi 3, 3, 26
 ; LE-64BIT-NEXT:    ld 9, 24(9)
+; LE-64BIT-NEXT:    srd 3, 3, 4
+; LE-64BIT-NEXT:    sldi 11, 6, 1
+; LE-64BIT-NEXT:    srd 10, 8, 4
 ; LE-64BIT-NEXT:    srd 6, 6, 4
-; LE-64BIT-NEXT:    sldi 10, 7, 1
-; LE-64BIT-NEXT:    srd 11, 8, 4
-; LE-64BIT-NEXT:    srd 7, 7, 4
-; LE-64BIT-NEXT:    sld 3, 10, 3
+; LE-64BIT-NEXT:    sld 7, 11, 7
+; LE-64BIT-NEXT:    or 7, 10, 7
 ; LE-64BIT-NEXT:    subfic 10, 4, 64
 ; LE-64BIT-NEXT:    srad 4, 9, 4
-; LE-64BIT-NEXT:    or 3, 11, 3
-; LE-64BIT-NEXT:    sld 11, 9, 10
 ; LE-64BIT-NEXT:    sld 8, 8, 10
+; LE-64BIT-NEXT:    sld 11, 9, 10
 ; LE-64BIT-NEXT:    std 4, 24(5)
-; LE-64BIT-NEXT:    or 6, 8, 6
-; LE-64BIT-NEXT:    or 4, 11, 7
-; LE-64BIT-NEXT:    std 3, 8(5)
-; LE-64BIT-NEXT:    std 6, 0(5)
-; LE-64BIT-NEXT:    std 4, 16(5)
+; LE-64BIT-NEXT:    std 7, 8(5)
+; LE-64BIT-NEXT:    or 3, 8, 3
+; LE-64BIT-NEXT:    std 3, 0(5)
+; LE-64BIT-NEXT:    or 3, 11, 6
+; LE-64BIT-NEXT:    std 3, 16(5)
 ; LE-64BIT-NEXT:    blr
 ;
 ; BE-LABEL: ashr_32bytes:
 ; BE:       # %bb.0:
-; BE-NEXT:    ld 6, 0(3)
-; BE-NEXT:    ld 7, 8(3)
-; BE-NEXT:    ld 8, 16(3)
+; BE-NEXT:    ld 7, 0(3)
+; BE-NEXT:    ld 8, 8(3)
+; BE-NEXT:    ld 9, 16(3)
 ; BE-NEXT:    ld 3, 24(3)
 ; BE-NEXT:    lwz 4, 28(4)
-; BE-NEXT:    addi 9, 1, -64
-; BE-NEXT:    addi 10, 1, -32
-; BE-NEXT:    std 3, 56(9)
-; BE-NEXT:    std 6, 32(9)
-; BE-NEXT:    sradi 3, 6, 63
-; BE-NEXT:    rlwinm 6, 4, 29, 27, 31
-; BE-NEXT:    std 3, 24(9)
-; BE-NEXT:    std 3, 16(9)
-; BE-NEXT:    std 3, 8(9)
+; BE-NEXT:    addi 6, 1, -32
+; BE-NEXT:    std 3, -8(1)
+; BE-NEXT:    std 7, -32(1)
+; BE-NEXT:    sradi 3, 7, 63
+; BE-NEXT:    rlwinm 7, 4, 29, 27, 28
+; BE-NEXT:    std 3, -40(1)
+; BE-NEXT:    std 3, -48(1)
+; BE-NEXT:    std 3, -56(1)
 ; BE-NEXT:    std 3, -64(1)
-; BE-NEXT:    neg 3, 6
-; BE-NEXT:    std 8, 48(9)
-; BE-NEXT:    std 7, 40(9)
+; BE-NEXT:    neg 3, 7
+; BE-NEXT:    std 9, -16(1)
+; BE-NEXT:    std 8, -24(1)
 ; BE-NEXT:    extsw 3, 3
-; BE-NEXT:    ldux 3, 10, 3
-; BE-NEXT:    li 6, 7
-; BE-NEXT:    nand 6, 4, 6
-; BE-NEXT:    clrlwi 4, 4, 29
-; BE-NEXT:    clrlwi 6, 6, 26
-; BE-NEXT:    ld 7, 8(10)
-; BE-NEXT:    ld 8, 16(10)
-; BE-NEXT:    ld 9, 24(10)
-; BE-NEXT:    subfic 10, 4, 64
-; BE-NEXT:    sldi 11, 7, 1
-; BE-NEXT:    srd 7, 7, 4
-; BE-NEXT:    srd 9, 9, 4
-; BE-NEXT:    sld 6, 11, 6
-; BE-NEXT:    sld 11, 3, 10
-; BE-NEXT:    sld 10, 8, 10
-; BE-NEXT:    srd 8, 8, 4
+; BE-NEXT:    ldux 3, 6, 3
+; BE-NEXT:    clrlwi 4, 4, 26
+; BE-NEXT:    subfic 9, 4, 64
+; BE-NEXT:    ld 7, 8(6)
+; BE-NEXT:    ld 8, 24(6)
+; BE-NEXT:    ld 6, 16(6)
+; BE-NEXT:    sld 10, 3, 9
 ; BE-NEXT:    srad 3, 3, 4
-; BE-NEXT:    or 7, 11, 7
-; BE-NEXT:    or 6, 8, 6
-; BE-NEXT:    or 8, 10, 9
 ; BE-NEXT:    std 3, 0(5)
-; BE-NEXT:    std 8, 24(5)
-; BE-NEXT:    std 7, 8(5)
+; BE-NEXT:    srd 11, 7, 4
+; BE-NEXT:    srd 8, 8, 4
+; BE-NEXT:    sld 7, 7, 9
+; BE-NEXT:    sld 9, 6, 9
+; BE-NEXT:    srd 6, 6, 4
+; BE-NEXT:    or 10, 10, 11
+; BE-NEXT:    or 8, 9, 8
+; BE-NEXT:    or 6, 7, 6
 ; BE-NEXT:    std 6, 16(5)
+; BE-NEXT:    std 8, 24(5)
+; BE-NEXT:    std 10, 8(5)
 ; BE-NEXT:    blr
 ;
 ; LE-32BIT-LABEL: ashr_32bytes:
 ; LE-32BIT:       # %bb.0:
 ; LE-32BIT-NEXT:    stwu 1, -112(1)
 ; LE-32BIT-NEXT:    lwz 7, 0(3)
-; LE-32BIT-NEXT:    addi 6, 1, 52
+; LE-32BIT-NEXT:    addi 6, 1, 48
 ; LE-32BIT-NEXT:    lwz 8, 4(3)
 ; LE-32BIT-NEXT:    lwz 9, 8(3)
 ; LE-32BIT-NEXT:    lwz 10, 12(3)
@@ -912,76 +868,72 @@ define void @ashr_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; LE-32BIT-NEXT:    lwz 0, 24(3)
 ; LE-32BIT-NEXT:    lwz 3, 28(3)
 ; LE-32BIT-NEXT:    lwz 4, 28(4)
-; LE-32BIT-NEXT:    stw 3, 80(1)
+; LE-32BIT-NEXT:    stw 3, 76(1)
 ; LE-32BIT-NEXT:    srawi 3, 7, 31
-; LE-32BIT-NEXT:    stw 7, 52(1)
-; LE-32BIT-NEXT:    rlwinm 7, 4, 29, 27, 31
+; LE-32BIT-NEXT:    stw 7, 48(1)
+; LE-32BIT-NEXT:    rlwinm 7, 4, 29, 27, 29
 ; LE-32BIT-NEXT:    stw 25, 84(1) # 4-byte Folded Spill
+; LE-32BIT-NEXT:    clrlwi 4, 4, 27
 ; LE-32BIT-NEXT:    stw 26, 88(1) # 4-byte Folded Spill
 ; LE-32BIT-NEXT:    stw 27, 92(1) # 4-byte Folded Spill
 ; LE-32BIT-NEXT:    stw 28, 96(1) # 4-byte Folded Spill
 ; LE-32BIT-NEXT:    stw 29, 100(1) # 4-byte Folded Spill
 ; LE-32BIT-NEXT:    stw 30, 104(1) # 4-byte Folded Spill
-; LE-32BIT-NEXT:    stw 0, 76(1)
-; LE-32BIT-NEXT:    stw 12, 72(1)
-; LE-32BIT-NEXT:    stw 11, 68(1)
-; LE-32BIT-NEXT:    stw 10, 64(1)
-; LE-32BIT-NEXT:    stw 9, 60(1)
-; LE-32BIT-NEXT:    li 9, 7
-; LE-32BIT-NEXT:    stw 8, 56(1)
-; LE-32BIT-NEXT:    nand 9, 4, 9
-; LE-32BIT-NEXT:    stw 3, 48(1)
-; LE-32BIT-NEXT:    clrlwi 4, 4, 29
-; LE-32BIT-NEXT:    stw 3, 44(1)
 ; LE-32BIT-NEXT:    subfic 30, 4, 32
+; LE-32BIT-NEXT:    stw 0, 72(1)
+; LE-32BIT-NEXT:    stw 12, 68(1)
+; LE-32BIT-NEXT:    xori 12, 4, 31
+; LE-32BIT-NEXT:    stw 11, 64(1)
+; LE-32BIT-NEXT:    stw 10, 60(1)
+; LE-32BIT-NEXT:    stw 9, 56(1)
+; LE-32BIT-NEXT:    stw 8, 52(1)
+; LE-32BIT-NEXT:    stw 3, 44(1)
 ; LE-32BIT-NEXT:    stw 3, 40(1)
-; LE-32BIT-NEXT:    clrlwi 9, 9, 27
 ; LE-32BIT-NEXT:    stw 3, 36(1)
 ; LE-32BIT-NEXT:    stw 3, 32(1)
 ; LE-32BIT-NEXT:    stw 3, 28(1)
 ; LE-32BIT-NEXT:    stw 3, 24(1)
 ; LE-32BIT-NEXT:    stw 3, 20(1)
+; LE-32BIT-NEXT:    stw 3, 16(1)
 ; LE-32BIT-NEXT:    sub 3, 6, 7
-; LE-32BIT-NEXT:    lwz 6, 4(3)
-; LE-32BIT-NEXT:    lwz 7, 8(3)
-; LE-32BIT-NEXT:    lwz 8, 12(3)
-; LE-32BIT-NEXT:    slwi 29, 6, 1
-; LE-32BIT-NEXT:    lwz 10, 16(3)
-; LE-32BIT-NEXT:    srw 28, 7, 4
-; LE-32BIT-NEXT:    lwz 11, 20(3)
-; LE-32BIT-NEXT:    slwi 27, 8, 1
-; LE-32BIT-NEXT:    lwz 12, 24(3)
+; LE-32BIT-NEXT:    lwz 6, 8(3)
+; LE-32BIT-NEXT:    lwz 7, 4(3)
+; LE-32BIT-NEXT:    lwz 8, 0(3)
+; LE-32BIT-NEXT:    srw 29, 6, 4
+; LE-32BIT-NEXT:    lwz 9, 12(3)
+; LE-32BIT-NEXT:    slw 6, 6, 30
+; LE-32BIT-NEXT:    lwz 10, 20(3)
+; LE-32BIT-NEXT:    slw 28, 8, 30
+; LE-32BIT-NEXT:    lwz 11, 16(3)
+; LE-32BIT-NEXT:    srw 27, 9, 4
+; LE-32BIT-NEXT:    lwz 0, 28(3)
 ; LE-32BIT-NEXT:    srw 26, 10, 4
-; LE-32BIT-NEXT:    lwz 0, 0(3)
-; LE-32BIT-NEXT:    srw 6, 6, 4
-; LE-32BIT-NEXT:    lwz 3, 28(3)
-; LE-32BIT-NEXT:    srw 25, 12, 4
-; LE-32BIT-NEXT:    slw 12, 12, 30
-; LE-32BIT-NEXT:    slw 7, 7, 30
-; LE-32BIT-NEXT:    srw 3, 3, 4
+; LE-32BIT-NEXT:    lwz 3, 24(3)
+; LE-32BIT-NEXT:    slw 25, 11, 30
+; LE-32BIT-NEXT:    slw 9, 9, 30
 ; LE-32BIT-NEXT:    slw 10, 10, 30
-; LE-32BIT-NEXT:    slw 30, 0, 30
-; LE-32BIT-NEXT:    srw 8, 8, 4
-; LE-32BIT-NEXT:    sraw 0, 0, 4
-; LE-32BIT-NEXT:    srw 4, 11, 4
-; LE-32BIT-NEXT:    or 3, 12, 3
+; LE-32BIT-NEXT:    slw 30, 3, 30
+; LE-32BIT-NEXT:    srw 3, 3, 4
+; LE-32BIT-NEXT:    srw 0, 0, 4
+; LE-32BIT-NEXT:    or 3, 10, 3
+; LE-32BIT-NEXT:    srw 11, 11, 4
+; LE-32BIT-NEXT:    stw 3, 24(5)
+; LE-32BIT-NEXT:    or 3, 30, 0
 ; LE-32BIT-NEXT:    stw 3, 28(5)
-; LE-32BIT-NEXT:    or 3, 10, 4
-; LE-32BIT-NEXT:    slwi 11, 11, 1
+; LE-32BIT-NEXT:    or 3, 9, 11
+; LE-32BIT-NEXT:    stw 3, 16(5)
+; LE-32BIT-NEXT:    or 3, 25, 26
+; LE-32BIT-NEXT:    sraw 8, 8, 4
+; LE-32BIT-NEXT:    srw 4, 7, 4
+; LE-32BIT-NEXT:    slwi 7, 7, 1
 ; LE-32BIT-NEXT:    stw 3, 20(5)
-; LE-32BIT-NEXT:    or 3, 7, 8
-; LE-32BIT-NEXT:    slw 29, 29, 9
-; LE-32BIT-NEXT:    slw 27, 27, 9
-; LE-32BIT-NEXT:    slw 9, 11, 9
+; LE-32BIT-NEXT:    or 3, 6, 27
+; LE-32BIT-NEXT:    slw 7, 7, 12
 ; LE-32BIT-NEXT:    stw 3, 12(5)
-; LE-32BIT-NEXT:    or 3, 30, 6
+; LE-32BIT-NEXT:    or 3, 28, 4
 ; LE-32BIT-NEXT:    stw 3, 4(5)
-; LE-32BIT-NEXT:    or 3, 25, 9
-; LE-32BIT-NEXT:    stw 3, 24(5)
-; LE-32BIT-NEXT:    or 3, 26, 27
-; LE-32BIT-NEXT:    stw 3, 16(5)
-; LE-32BIT-NEXT:    or 3, 28, 29
-; LE-32BIT-NEXT:    stw 0, 0(5)
+; LE-32BIT-NEXT:    or 3, 29, 7
+; LE-32BIT-NEXT:    stw 8, 0(5)
 ; LE-32BIT-NEXT:    stw 3, 8(5)
 ; LE-32BIT-NEXT:    lwz 30, 104(1) # 4-byte Folded Reload
 ; LE-32BIT-NEXT:    lwz 29, 100(1) # 4-byte Folded Reload

diff  --git a/llvm/test/CodeGen/RISCV/shifts.ll b/llvm/test/CodeGen/RISCV/shifts.ll
index f61cbfd3ed7257..5ba8755201ddf5 100644
--- a/llvm/test/CodeGen/RISCV/shifts.ll
+++ b/llvm/test/CodeGen/RISCV/shifts.ll
@@ -157,106 +157,33 @@ define i128 @lshr128(i128 %a, i128 %b) nounwind {
 ; RV32I-NEXT:    lw a4, 4(a1)
 ; RV32I-NEXT:    lw a5, 8(a1)
 ; RV32I-NEXT:    lw a1, 12(a1)
-; RV32I-NEXT:    sb zero, 31(sp)
-; RV32I-NEXT:    sb zero, 30(sp)
-; RV32I-NEXT:    sb zero, 29(sp)
-; RV32I-NEXT:    sb zero, 28(sp)
-; RV32I-NEXT:    sb zero, 27(sp)
-; RV32I-NEXT:    sb zero, 26(sp)
-; RV32I-NEXT:    sb zero, 25(sp)
-; RV32I-NEXT:    sb zero, 24(sp)
-; RV32I-NEXT:    sb zero, 23(sp)
-; RV32I-NEXT:    sb zero, 22(sp)
-; RV32I-NEXT:    sb zero, 21(sp)
-; RV32I-NEXT:    sb zero, 20(sp)
-; RV32I-NEXT:    sb zero, 19(sp)
-; RV32I-NEXT:    sb zero, 18(sp)
-; RV32I-NEXT:    sb zero, 17(sp)
-; RV32I-NEXT:    sb zero, 16(sp)
-; RV32I-NEXT:    sb a1, 12(sp)
-; RV32I-NEXT:    sb a5, 8(sp)
-; RV32I-NEXT:    sb a4, 4(sp)
-; RV32I-NEXT:    sb a3, 0(sp)
-; RV32I-NEXT:    srli a6, a1, 24
-; RV32I-NEXT:    sb a6, 15(sp)
-; RV32I-NEXT:    srli a6, a1, 16
-; RV32I-NEXT:    sb a6, 14(sp)
-; RV32I-NEXT:    srli a1, a1, 8
-; RV32I-NEXT:    sb a1, 13(sp)
-; RV32I-NEXT:    srli a1, a5, 24
-; RV32I-NEXT:    sb a1, 11(sp)
-; RV32I-NEXT:    srli a1, a5, 16
-; RV32I-NEXT:    sb a1, 10(sp)
-; RV32I-NEXT:    srli a5, a5, 8
-; RV32I-NEXT:    sb a5, 9(sp)
-; RV32I-NEXT:    srli a1, a4, 24
-; RV32I-NEXT:    sb a1, 7(sp)
-; RV32I-NEXT:    srli a1, a4, 16
-; RV32I-NEXT:    sb a1, 6(sp)
-; RV32I-NEXT:    srli a4, a4, 8
-; RV32I-NEXT:    sb a4, 5(sp)
-; RV32I-NEXT:    srli a1, a3, 24
-; RV32I-NEXT:    sb a1, 3(sp)
-; RV32I-NEXT:    srli a1, a3, 16
-; RV32I-NEXT:    sb a1, 2(sp)
-; RV32I-NEXT:    srli a3, a3, 8
-; RV32I-NEXT:    sb a3, 1(sp)
-; RV32I-NEXT:    slli a1, a2, 25
-; RV32I-NEXT:    srli a1, a1, 28
+; RV32I-NEXT:    sw zero, 28(sp)
+; RV32I-NEXT:    sw zero, 24(sp)
+; RV32I-NEXT:    sw zero, 20(sp)
+; RV32I-NEXT:    sw zero, 16(sp)
+; RV32I-NEXT:    sw a1, 12(sp)
+; RV32I-NEXT:    sw a5, 8(sp)
+; RV32I-NEXT:    sw a4, 4(sp)
+; RV32I-NEXT:    sw a3, 0(sp)
+; RV32I-NEXT:    srli a1, a2, 3
+; RV32I-NEXT:    andi a1, a1, 12
 ; RV32I-NEXT:    mv a3, sp
 ; RV32I-NEXT:    add a1, a3, a1
-; RV32I-NEXT:    lbu a3, 1(a1)
-; RV32I-NEXT:    lbu a4, 0(a1)
-; RV32I-NEXT:    lbu a5, 2(a1)
-; RV32I-NEXT:    lbu a6, 3(a1)
-; RV32I-NEXT:    slli a3, a3, 8
-; RV32I-NEXT:    or a3, a3, a4
-; RV32I-NEXT:    slli a5, a5, 16
-; RV32I-NEXT:    slli a6, a6, 24
-; RV32I-NEXT:    or a4, a6, a5
-; RV32I-NEXT:    or a3, a4, a3
-; RV32I-NEXT:    andi a2, a2, 7
+; RV32I-NEXT:    lw a3, 0(a1)
+; RV32I-NEXT:    lw a4, 4(a1)
 ; RV32I-NEXT:    srl a3, a3, a2
-; RV32I-NEXT:    lbu a4, 5(a1)
-; RV32I-NEXT:    lbu a5, 4(a1)
-; RV32I-NEXT:    lbu a6, 6(a1)
-; RV32I-NEXT:    lbu a7, 7(a1)
-; RV32I-NEXT:    slli a4, a4, 8
-; RV32I-NEXT:    or a4, a4, a5
-; RV32I-NEXT:    slli a6, a6, 16
-; RV32I-NEXT:    slli a7, a7, 24
-; RV32I-NEXT:    or a5, a7, a6
-; RV32I-NEXT:    or a4, a5, a4
 ; RV32I-NEXT:    slli a5, a4, 1
-; RV32I-NEXT:    xori a6, a2, 31
+; RV32I-NEXT:    andi a6, a2, 31
+; RV32I-NEXT:    xori a6, a6, 31
+; RV32I-NEXT:    lw a7, 8(a1)
 ; RV32I-NEXT:    sll a5, a5, a6
 ; RV32I-NEXT:    or a3, a3, a5
 ; RV32I-NEXT:    srl a4, a4, a2
-; RV32I-NEXT:    lbu a5, 9(a1)
-; RV32I-NEXT:    lbu a7, 8(a1)
-; RV32I-NEXT:    lbu t0, 10(a1)
-; RV32I-NEXT:    lbu t1, 11(a1)
-; RV32I-NEXT:    slli a5, a5, 8
-; RV32I-NEXT:    or a5, a5, a7
-; RV32I-NEXT:    slli t0, t0, 16
-; RV32I-NEXT:    slli t1, t1, 24
-; RV32I-NEXT:    or a7, t1, t0
-; RV32I-NEXT:    or a5, a7, a5
-; RV32I-NEXT:    slli a7, a5, 1
-; RV32I-NEXT:    not t0, a2
-; RV32I-NEXT:    lbu t1, 13(a1)
-; RV32I-NEXT:    sll a7, a7, t0
-; RV32I-NEXT:    or a4, a4, a7
-; RV32I-NEXT:    lbu a7, 12(a1)
-; RV32I-NEXT:    slli t1, t1, 8
-; RV32I-NEXT:    lbu t0, 14(a1)
-; RV32I-NEXT:    lbu a1, 15(a1)
-; RV32I-NEXT:    or a7, t1, a7
-; RV32I-NEXT:    srl a5, a5, a2
-; RV32I-NEXT:    slli t0, t0, 16
-; RV32I-NEXT:    slli a1, a1, 24
-; RV32I-NEXT:    or a1, a1, t0
-; RV32I-NEXT:    or a1, a1, a7
+; RV32I-NEXT:    slli a5, a7, 1
+; RV32I-NEXT:    lw a1, 12(a1)
+; RV32I-NEXT:    sll a5, a5, a6
+; RV32I-NEXT:    or a4, a4, a5
+; RV32I-NEXT:    srl a5, a7, a2
 ; RV32I-NEXT:    slli a7, a1, 1
 ; RV32I-NEXT:    sll a6, a7, a6
 ; RV32I-NEXT:    or a5, a5, a6
@@ -299,110 +226,34 @@ define i128 @ashr128(i128 %a, i128 %b) nounwind {
 ; RV32I-NEXT:    lw a4, 8(a1)
 ; RV32I-NEXT:    lw a5, 4(a1)
 ; RV32I-NEXT:    lw a1, 0(a1)
-; RV32I-NEXT:    sb a3, 12(sp)
-; RV32I-NEXT:    sb a4, 8(sp)
-; RV32I-NEXT:    sb a5, 4(sp)
-; RV32I-NEXT:    sb a1, 0(sp)
-; RV32I-NEXT:    srai a6, a3, 31
-; RV32I-NEXT:    sb a6, 28(sp)
-; RV32I-NEXT:    sb a6, 24(sp)
-; RV32I-NEXT:    sb a6, 20(sp)
-; RV32I-NEXT:    sb a6, 16(sp)
-; RV32I-NEXT:    srli a7, a3, 24
-; RV32I-NEXT:    sb a7, 15(sp)
-; RV32I-NEXT:    srli a7, a3, 16
-; RV32I-NEXT:    sb a7, 14(sp)
-; RV32I-NEXT:    srli a3, a3, 8
-; RV32I-NEXT:    sb a3, 13(sp)
-; RV32I-NEXT:    srli a3, a4, 24
-; RV32I-NEXT:    sb a3, 11(sp)
-; RV32I-NEXT:    srli a3, a4, 16
-; RV32I-NEXT:    sb a3, 10(sp)
-; RV32I-NEXT:    srli a4, a4, 8
-; RV32I-NEXT:    sb a4, 9(sp)
-; RV32I-NEXT:    srli a3, a5, 24
-; RV32I-NEXT:    sb a3, 7(sp)
-; RV32I-NEXT:    srli a3, a5, 16
-; RV32I-NEXT:    sb a3, 6(sp)
-; RV32I-NEXT:    srli a5, a5, 8
-; RV32I-NEXT:    sb a5, 5(sp)
-; RV32I-NEXT:    srli a3, a1, 24
-; RV32I-NEXT:    sb a3, 3(sp)
-; RV32I-NEXT:    srli a3, a1, 16
-; RV32I-NEXT:    sb a3, 2(sp)
-; RV32I-NEXT:    srli a1, a1, 8
-; RV32I-NEXT:    sb a1, 1(sp)
-; RV32I-NEXT:    srli a1, a6, 24
-; RV32I-NEXT:    sb a1, 31(sp)
-; RV32I-NEXT:    srli a3, a6, 16
-; RV32I-NEXT:    sb a3, 30(sp)
-; RV32I-NEXT:    srli a4, a6, 8
-; RV32I-NEXT:    sb a4, 29(sp)
-; RV32I-NEXT:    sb a1, 27(sp)
-; RV32I-NEXT:    sb a3, 26(sp)
-; RV32I-NEXT:    sb a4, 25(sp)
-; RV32I-NEXT:    sb a1, 23(sp)
-; RV32I-NEXT:    sb a3, 22(sp)
-; RV32I-NEXT:    sb a4, 21(sp)
-; RV32I-NEXT:    sb a1, 19(sp)
-; RV32I-NEXT:    sb a3, 18(sp)
-; RV32I-NEXT:    sb a4, 17(sp)
-; RV32I-NEXT:    slli a1, a2, 25
-; RV32I-NEXT:    srli a1, a1, 28
+; RV32I-NEXT:    sw a3, 12(sp)
+; RV32I-NEXT:    sw a4, 8(sp)
+; RV32I-NEXT:    sw a5, 4(sp)
+; RV32I-NEXT:    sw a1, 0(sp)
+; RV32I-NEXT:    srai a3, a3, 31
+; RV32I-NEXT:    sw a3, 28(sp)
+; RV32I-NEXT:    sw a3, 24(sp)
+; RV32I-NEXT:    sw a3, 20(sp)
+; RV32I-NEXT:    sw a3, 16(sp)
+; RV32I-NEXT:    srli a1, a2, 3
+; RV32I-NEXT:    andi a1, a1, 12
 ; RV32I-NEXT:    mv a3, sp
 ; RV32I-NEXT:    add a1, a3, a1
-; RV32I-NEXT:    lbu a3, 1(a1)
-; RV32I-NEXT:    lbu a4, 0(a1)
-; RV32I-NEXT:    lbu a5, 2(a1)
-; RV32I-NEXT:    lbu a6, 3(a1)
-; RV32I-NEXT:    slli a3, a3, 8
-; RV32I-NEXT:    or a3, a3, a4
-; RV32I-NEXT:    slli a5, a5, 16
-; RV32I-NEXT:    slli a6, a6, 24
-; RV32I-NEXT:    or a4, a6, a5
-; RV32I-NEXT:    or a3, a4, a3
-; RV32I-NEXT:    andi a2, a2, 7
+; RV32I-NEXT:    lw a3, 0(a1)
+; RV32I-NEXT:    lw a4, 4(a1)
 ; RV32I-NEXT:    srl a3, a3, a2
-; RV32I-NEXT:    lbu a4, 5(a1)
-; RV32I-NEXT:    lbu a5, 4(a1)
-; RV32I-NEXT:    lbu a6, 6(a1)
-; RV32I-NEXT:    lbu a7, 7(a1)
-; RV32I-NEXT:    slli a4, a4, 8
-; RV32I-NEXT:    or a4, a4, a5
-; RV32I-NEXT:    slli a6, a6, 16
-; RV32I-NEXT:    slli a7, a7, 24
-; RV32I-NEXT:    or a5, a7, a6
-; RV32I-NEXT:    or a4, a5, a4
 ; RV32I-NEXT:    slli a5, a4, 1
-; RV32I-NEXT:    xori a6, a2, 31
+; RV32I-NEXT:    andi a6, a2, 31
+; RV32I-NEXT:    xori a6, a6, 31
+; RV32I-NEXT:    lw a7, 8(a1)
 ; RV32I-NEXT:    sll a5, a5, a6
 ; RV32I-NEXT:    or a3, a3, a5
 ; RV32I-NEXT:    srl a4, a4, a2
-; RV32I-NEXT:    lbu a5, 9(a1)
-; RV32I-NEXT:    lbu a7, 8(a1)
-; RV32I-NEXT:    lbu t0, 10(a1)
-; RV32I-NEXT:    lbu t1, 11(a1)
-; RV32I-NEXT:    slli a5, a5, 8
-; RV32I-NEXT:    or a5, a5, a7
-; RV32I-NEXT:    slli t0, t0, 16
-; RV32I-NEXT:    slli t1, t1, 24
-; RV32I-NEXT:    or a7, t1, t0
-; RV32I-NEXT:    or a5, a7, a5
-; RV32I-NEXT:    slli a7, a5, 1
-; RV32I-NEXT:    not t0, a2
-; RV32I-NEXT:    lbu t1, 13(a1)
-; RV32I-NEXT:    sll a7, a7, t0
-; RV32I-NEXT:    or a4, a4, a7
-; RV32I-NEXT:    lbu a7, 12(a1)
-; RV32I-NEXT:    slli t1, t1, 8
-; RV32I-NEXT:    lbu t0, 14(a1)
-; RV32I-NEXT:    lbu a1, 15(a1)
-; RV32I-NEXT:    or a7, t1, a7
-; RV32I-NEXT:    srl a5, a5, a2
-; RV32I-NEXT:    slli t0, t0, 16
-; RV32I-NEXT:    slli a1, a1, 24
-; RV32I-NEXT:    or a1, a1, t0
-; RV32I-NEXT:    or a1, a1, a7
+; RV32I-NEXT:    slli a5, a7, 1
+; RV32I-NEXT:    lw a1, 12(a1)
+; RV32I-NEXT:    sll a5, a5, a6
+; RV32I-NEXT:    or a4, a4, a5
+; RV32I-NEXT:    srl a5, a7, a2
 ; RV32I-NEXT:    slli a7, a1, 1
 ; RV32I-NEXT:    sll a6, a7, a6
 ; RV32I-NEXT:    or a5, a5, a6
@@ -445,114 +296,41 @@ define i128 @shl128(i128 %a, i128 %b) nounwind {
 ; RV32I-NEXT:    lw a4, 4(a1)
 ; RV32I-NEXT:    lw a5, 8(a1)
 ; RV32I-NEXT:    lw a1, 12(a1)
-; RV32I-NEXT:    sb zero, 15(sp)
-; RV32I-NEXT:    sb zero, 14(sp)
-; RV32I-NEXT:    sb zero, 13(sp)
-; RV32I-NEXT:    sb zero, 12(sp)
-; RV32I-NEXT:    sb zero, 11(sp)
-; RV32I-NEXT:    sb zero, 10(sp)
-; RV32I-NEXT:    sb zero, 9(sp)
-; RV32I-NEXT:    sb zero, 8(sp)
-; RV32I-NEXT:    sb zero, 7(sp)
-; RV32I-NEXT:    sb zero, 6(sp)
-; RV32I-NEXT:    sb zero, 5(sp)
-; RV32I-NEXT:    sb zero, 4(sp)
-; RV32I-NEXT:    sb zero, 3(sp)
-; RV32I-NEXT:    sb zero, 2(sp)
-; RV32I-NEXT:    sb zero, 1(sp)
-; RV32I-NEXT:    sb zero, 0(sp)
-; RV32I-NEXT:    sb a1, 28(sp)
-; RV32I-NEXT:    sb a5, 24(sp)
-; RV32I-NEXT:    sb a4, 20(sp)
-; RV32I-NEXT:    sb a3, 16(sp)
-; RV32I-NEXT:    srli a6, a1, 24
-; RV32I-NEXT:    sb a6, 31(sp)
-; RV32I-NEXT:    srli a6, a1, 16
-; RV32I-NEXT:    sb a6, 30(sp)
-; RV32I-NEXT:    srli a1, a1, 8
-; RV32I-NEXT:    sb a1, 29(sp)
-; RV32I-NEXT:    srli a1, a5, 24
-; RV32I-NEXT:    sb a1, 27(sp)
-; RV32I-NEXT:    srli a1, a5, 16
-; RV32I-NEXT:    sb a1, 26(sp)
-; RV32I-NEXT:    srli a5, a5, 8
-; RV32I-NEXT:    sb a5, 25(sp)
-; RV32I-NEXT:    srli a1, a4, 24
-; RV32I-NEXT:    sb a1, 23(sp)
-; RV32I-NEXT:    srli a1, a4, 16
-; RV32I-NEXT:    sb a1, 22(sp)
-; RV32I-NEXT:    srli a4, a4, 8
-; RV32I-NEXT:    sb a4, 21(sp)
-; RV32I-NEXT:    srli a1, a3, 24
-; RV32I-NEXT:    sb a1, 19(sp)
-; RV32I-NEXT:    srli a1, a3, 16
-; RV32I-NEXT:    sb a1, 18(sp)
-; RV32I-NEXT:    srli a3, a3, 8
-; RV32I-NEXT:    sb a3, 17(sp)
-; RV32I-NEXT:    slli a1, a2, 25
-; RV32I-NEXT:    srli a1, a1, 28
+; RV32I-NEXT:    sw zero, 12(sp)
+; RV32I-NEXT:    sw zero, 8(sp)
+; RV32I-NEXT:    sw zero, 4(sp)
+; RV32I-NEXT:    sw zero, 0(sp)
+; RV32I-NEXT:    sw a1, 28(sp)
+; RV32I-NEXT:    sw a5, 24(sp)
+; RV32I-NEXT:    sw a4, 20(sp)
+; RV32I-NEXT:    sw a3, 16(sp)
+; RV32I-NEXT:    srli a1, a2, 3
+; RV32I-NEXT:    andi a1, a1, 12
 ; RV32I-NEXT:    addi a3, sp, 16
-; RV32I-NEXT:    sub a1, a3, a1
-; RV32I-NEXT:    lbu a3, 5(a1)
-; RV32I-NEXT:    lbu a4, 4(a1)
-; RV32I-NEXT:    lbu a5, 6(a1)
-; RV32I-NEXT:    lbu a6, 7(a1)
-; RV32I-NEXT:    slli a3, a3, 8
-; RV32I-NEXT:    or a3, a3, a4
-; RV32I-NEXT:    slli a5, a5, 16
-; RV32I-NEXT:    slli a6, a6, 24
-; RV32I-NEXT:    or a4, a6, a5
-; RV32I-NEXT:    or a3, a4, a3
-; RV32I-NEXT:    andi a2, a2, 7
-; RV32I-NEXT:    sll a4, a3, a2
-; RV32I-NEXT:    lbu a5, 1(a1)
-; RV32I-NEXT:    lbu a6, 0(a1)
-; RV32I-NEXT:    lbu a7, 2(a1)
-; RV32I-NEXT:    lbu t0, 3(a1)
-; RV32I-NEXT:    slli a5, a5, 8
-; RV32I-NEXT:    or a5, a5, a6
-; RV32I-NEXT:    slli a7, a7, 16
-; RV32I-NEXT:    slli t0, t0, 24
-; RV32I-NEXT:    or a6, t0, a7
-; RV32I-NEXT:    or a5, a6, a5
-; RV32I-NEXT:    srli a6, a5, 1
-; RV32I-NEXT:    xori a7, a2, 31
+; RV32I-NEXT:    sub a3, a3, a1
+; RV32I-NEXT:    lw a1, 4(a3)
+; RV32I-NEXT:    lw a4, 0(a3)
+; RV32I-NEXT:    sll a5, a1, a2
+; RV32I-NEXT:    srli a6, a4, 1
+; RV32I-NEXT:    andi a7, a2, 31
+; RV32I-NEXT:    lw t0, 8(a3)
+; RV32I-NEXT:    xori a7, a7, 31
 ; RV32I-NEXT:    srl a6, a6, a7
-; RV32I-NEXT:    or a4, a4, a6
-; RV32I-NEXT:    lbu a6, 9(a1)
-; RV32I-NEXT:    lbu t0, 8(a1)
-; RV32I-NEXT:    lbu t1, 10(a1)
-; RV32I-NEXT:    lbu t2, 11(a1)
-; RV32I-NEXT:    slli a6, a6, 8
-; RV32I-NEXT:    or a6, a6, t0
-; RV32I-NEXT:    slli t1, t1, 16
-; RV32I-NEXT:    slli t2, t2, 24
-; RV32I-NEXT:    or t0, t2, t1
-; RV32I-NEXT:    or a6, t0, a6
-; RV32I-NEXT:    sll t0, a6, a2
-; RV32I-NEXT:    srli a3, a3, 1
-; RV32I-NEXT:    not t1, a2
-; RV32I-NEXT:    srl a3, a3, t1
-; RV32I-NEXT:    or a3, t0, a3
-; RV32I-NEXT:    lbu t0, 13(a1)
-; RV32I-NEXT:    lbu t1, 12(a1)
-; RV32I-NEXT:    lbu t2, 14(a1)
-; RV32I-NEXT:    lbu a1, 15(a1)
-; RV32I-NEXT:    slli t0, t0, 8
-; RV32I-NEXT:    or t0, t0, t1
-; RV32I-NEXT:    slli t2, t2, 16
-; RV32I-NEXT:    slli a1, a1, 24
-; RV32I-NEXT:    or a1, a1, t2
-; RV32I-NEXT:    or a1, a1, t0
-; RV32I-NEXT:    sll a1, a1, a2
-; RV32I-NEXT:    srli a6, a6, 1
+; RV32I-NEXT:    or a5, a5, a6
+; RV32I-NEXT:    sll a6, t0, a2
+; RV32I-NEXT:    lw a3, 12(a3)
+; RV32I-NEXT:    srli a1, a1, 1
+; RV32I-NEXT:    srl a1, a1, a7
+; RV32I-NEXT:    or a1, a6, a1
+; RV32I-NEXT:    sll a3, a3, a2
+; RV32I-NEXT:    srli a6, t0, 1
 ; RV32I-NEXT:    srl a6, a6, a7
-; RV32I-NEXT:    or a1, a1, a6
-; RV32I-NEXT:    sll a2, a5, a2
+; RV32I-NEXT:    or a3, a3, a6
+; RV32I-NEXT:    sll a2, a4, a2
 ; RV32I-NEXT:    sw a2, 0(a0)
-; RV32I-NEXT:    sw a1, 12(a0)
-; RV32I-NEXT:    sw a3, 8(a0)
-; RV32I-NEXT:    sw a4, 4(a0)
+; RV32I-NEXT:    sw a3, 12(a0)
+; RV32I-NEXT:    sw a1, 8(a0)
+; RV32I-NEXT:    sw a5, 4(a0)
 ; RV32I-NEXT:    addi sp, sp, 32
 ; RV32I-NEXT:    ret
 ;
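
One detail worth calling out in the shifts.ll deltas above: the slot index is
now computed with `srli a1, a2, 3` plus `andi a1, a1, 12`, replacing the old
`slli`-by-25/`srli`-by-28 pair, and the residual register shift widens from
`andi a2, a2, 7` to the low five bits. The new index is just the old byte
index rounded down to a 4-byte boundary, which is why every subsequent `lw`
from the slot is aligned. A throwaway check of that identity (hypothetical
standalone snippet, not part of the patch):

    #include <assert.h>

    int main(void) {
      /* For a 128-bit shift the amount is taken mod 128. The old code
         extracted bits [6:3] of the amount ((amt << 25) >> 28, i.e. the
         byte index); the new form clears its low two bits so the index
         is always a whole number of 32-bit words. */
      for (unsigned amt = 0; amt < 128; ++amt)
        assert(((amt >> 3) & 12u) == (amt / 8 / 4) * 4);
      return 0;
    }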

diff  --git a/llvm/test/CodeGen/RISCV/wide-scalar-shift-by-byte-multiple-legalization.ll b/llvm/test/CodeGen/RISCV/wide-scalar-shift-by-byte-multiple-legalization.ll
index b0d435368e92bd..29fe0a7de6b3d4 100644
--- a/llvm/test/CodeGen/RISCV/wide-scalar-shift-by-byte-multiple-legalization.ll
+++ b/llvm/test/CodeGen/RISCV/wide-scalar-shift-by-byte-multiple-legalization.ll
@@ -723,98 +723,117 @@ define void @lshr_16bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ;
 ; RV32I-LABEL: lshr_16bytes:
 ; RV32I:       # %bb.0:
-; RV32I-NEXT:    addi sp, sp, -48
-; RV32I-NEXT:    sw s0, 44(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s1, 40(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s2, 36(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    lbu a3, 0(a0)
-; RV32I-NEXT:    lbu a4, 1(a0)
+; RV32I-NEXT:    addi sp, sp, -32
+; RV32I-NEXT:    lbu a3, 1(a0)
+; RV32I-NEXT:    lbu a4, 0(a0)
 ; RV32I-NEXT:    lbu a5, 2(a0)
 ; RV32I-NEXT:    lbu a6, 3(a0)
-; RV32I-NEXT:    lbu a7, 4(a0)
-; RV32I-NEXT:    lbu t0, 5(a0)
-; RV32I-NEXT:    lbu t1, 6(a0)
-; RV32I-NEXT:    lbu t2, 7(a0)
-; RV32I-NEXT:    lbu t3, 8(a0)
-; RV32I-NEXT:    lbu t4, 9(a0)
-; RV32I-NEXT:    lbu t5, 10(a0)
-; RV32I-NEXT:    lbu t6, 11(a0)
-; RV32I-NEXT:    lbu s0, 12(a0)
-; RV32I-NEXT:    lbu s1, 13(a0)
-; RV32I-NEXT:    lbu s2, 14(a0)
+; RV32I-NEXT:    slli a3, a3, 8
+; RV32I-NEXT:    or a3, a3, a4
+; RV32I-NEXT:    slli a5, a5, 16
+; RV32I-NEXT:    slli a6, a6, 24
+; RV32I-NEXT:    or a4, a6, a5
+; RV32I-NEXT:    or a3, a4, a3
+; RV32I-NEXT:    lbu a4, 5(a0)
+; RV32I-NEXT:    lbu a5, 4(a0)
+; RV32I-NEXT:    lbu a6, 6(a0)
+; RV32I-NEXT:    lbu a7, 7(a0)
+; RV32I-NEXT:    slli a4, a4, 8
+; RV32I-NEXT:    or a4, a4, a5
+; RV32I-NEXT:    slli a6, a6, 16
+; RV32I-NEXT:    slli a7, a7, 24
+; RV32I-NEXT:    or a5, a7, a6
+; RV32I-NEXT:    or a4, a5, a4
+; RV32I-NEXT:    lbu a5, 9(a0)
+; RV32I-NEXT:    lbu a6, 8(a0)
+; RV32I-NEXT:    lbu a7, 10(a0)
+; RV32I-NEXT:    lbu t0, 11(a0)
+; RV32I-NEXT:    slli a5, a5, 8
+; RV32I-NEXT:    or a5, a5, a6
+; RV32I-NEXT:    slli a7, a7, 16
+; RV32I-NEXT:    slli t0, t0, 24
+; RV32I-NEXT:    or a6, t0, a7
+; RV32I-NEXT:    or a5, a6, a5
+; RV32I-NEXT:    lbu a6, 13(a0)
+; RV32I-NEXT:    lbu a7, 12(a0)
+; RV32I-NEXT:    lbu t0, 14(a0)
 ; RV32I-NEXT:    lbu a0, 15(a0)
-; RV32I-NEXT:    lbu a1, 0(a1)
-; RV32I-NEXT:    sb zero, 35(sp)
-; RV32I-NEXT:    sb zero, 34(sp)
-; RV32I-NEXT:    sb zero, 33(sp)
-; RV32I-NEXT:    sb zero, 32(sp)
-; RV32I-NEXT:    sb zero, 31(sp)
-; RV32I-NEXT:    sb zero, 30(sp)
-; RV32I-NEXT:    sb zero, 29(sp)
-; RV32I-NEXT:    sb zero, 28(sp)
-; RV32I-NEXT:    sb zero, 27(sp)
-; RV32I-NEXT:    sb zero, 26(sp)
-; RV32I-NEXT:    sb zero, 25(sp)
-; RV32I-NEXT:    sb zero, 24(sp)
-; RV32I-NEXT:    sb zero, 23(sp)
-; RV32I-NEXT:    sb zero, 22(sp)
-; RV32I-NEXT:    sb zero, 21(sp)
-; RV32I-NEXT:    sb zero, 20(sp)
-; RV32I-NEXT:    sb a0, 19(sp)
-; RV32I-NEXT:    sb s2, 18(sp)
-; RV32I-NEXT:    sb s1, 17(sp)
-; RV32I-NEXT:    sb s0, 16(sp)
-; RV32I-NEXT:    sb t6, 15(sp)
-; RV32I-NEXT:    sb t5, 14(sp)
-; RV32I-NEXT:    sb t4, 13(sp)
-; RV32I-NEXT:    sb t3, 12(sp)
-; RV32I-NEXT:    sb t2, 11(sp)
-; RV32I-NEXT:    sb t1, 10(sp)
-; RV32I-NEXT:    sb t0, 9(sp)
-; RV32I-NEXT:    sb a7, 8(sp)
-; RV32I-NEXT:    sb a6, 7(sp)
-; RV32I-NEXT:    sb a5, 6(sp)
-; RV32I-NEXT:    sb a4, 5(sp)
-; RV32I-NEXT:    sb a3, 4(sp)
-; RV32I-NEXT:    andi a1, a1, 15
-; RV32I-NEXT:    addi a0, sp, 4
-; RV32I-NEXT:    add a0, a0, a1
-; RV32I-NEXT:    lbu a1, 5(a0)
-; RV32I-NEXT:    lbu a3, 4(a0)
-; RV32I-NEXT:    lbu a4, 7(a0)
-; RV32I-NEXT:    lbu a5, 6(a0)
-; RV32I-NEXT:    lbu a6, 1(a0)
-; RV32I-NEXT:    lbu a7, 0(a0)
-; RV32I-NEXT:    lbu t0, 3(a0)
-; RV32I-NEXT:    lbu t1, 2(a0)
-; RV32I-NEXT:    lbu t2, 13(a0)
-; RV32I-NEXT:    lbu t3, 12(a0)
-; RV32I-NEXT:    lbu t4, 15(a0)
-; RV32I-NEXT:    lbu t5, 14(a0)
-; RV32I-NEXT:    lbu t6, 10(a0)
-; RV32I-NEXT:    lbu s0, 11(a0)
-; RV32I-NEXT:    lbu s1, 8(a0)
-; RV32I-NEXT:    lbu a0, 9(a0)
-; RV32I-NEXT:    sb t6, 10(a2)
-; RV32I-NEXT:    sb s0, 11(a2)
-; RV32I-NEXT:    sb s1, 8(a2)
-; RV32I-NEXT:    sb a0, 9(a2)
-; RV32I-NEXT:    sb t5, 14(a2)
-; RV32I-NEXT:    sb t4, 15(a2)
-; RV32I-NEXT:    sb t3, 12(a2)
-; RV32I-NEXT:    sb t2, 13(a2)
-; RV32I-NEXT:    sb t1, 2(a2)
-; RV32I-NEXT:    sb t0, 3(a2)
+; RV32I-NEXT:    slli a6, a6, 8
+; RV32I-NEXT:    or a6, a6, a7
+; RV32I-NEXT:    slli t0, t0, 16
+; RV32I-NEXT:    slli a0, a0, 24
+; RV32I-NEXT:    or a0, a0, t0
+; RV32I-NEXT:    or a0, a0, a6
+; RV32I-NEXT:    lbu a6, 1(a1)
+; RV32I-NEXT:    lbu a7, 0(a1)
+; RV32I-NEXT:    lbu t0, 2(a1)
+; RV32I-NEXT:    lbu a1, 3(a1)
+; RV32I-NEXT:    slli a6, a6, 8
+; RV32I-NEXT:    or a6, a6, a7
+; RV32I-NEXT:    slli t0, t0, 16
+; RV32I-NEXT:    slli a1, a1, 24
+; RV32I-NEXT:    or a1, a1, t0
+; RV32I-NEXT:    or a1, a1, a6
+; RV32I-NEXT:    sw zero, 28(sp)
+; RV32I-NEXT:    sw zero, 24(sp)
+; RV32I-NEXT:    sw zero, 20(sp)
+; RV32I-NEXT:    sw zero, 16(sp)
+; RV32I-NEXT:    sw a0, 12(sp)
+; RV32I-NEXT:    sw a5, 8(sp)
+; RV32I-NEXT:    sw a4, 4(sp)
+; RV32I-NEXT:    sw a3, 0(sp)
+; RV32I-NEXT:    andi a0, a1, 12
+; RV32I-NEXT:    mv a3, sp
+; RV32I-NEXT:    add a0, a3, a0
+; RV32I-NEXT:    lw a3, 4(a0)
+; RV32I-NEXT:    slli a1, a1, 3
+; RV32I-NEXT:    srl a4, a3, a1
+; RV32I-NEXT:    lw a5, 8(a0)
+; RV32I-NEXT:    andi a6, a1, 24
+; RV32I-NEXT:    xori a6, a6, 31
+; RV32I-NEXT:    lw a7, 0(a0)
+; RV32I-NEXT:    slli t0, a5, 1
+; RV32I-NEXT:    sll t0, t0, a6
+; RV32I-NEXT:    or t0, a4, t0
+; RV32I-NEXT:    srl a7, a7, a1
+; RV32I-NEXT:    slli a3, a3, 1
+; RV32I-NEXT:    lw a0, 12(a0)
+; RV32I-NEXT:    sll a3, a3, a6
+; RV32I-NEXT:    or a3, a7, a3
+; RV32I-NEXT:    srl a5, a5, a1
+; RV32I-NEXT:    slli t1, a0, 1
+; RV32I-NEXT:    sll a6, t1, a6
+; RV32I-NEXT:    or a6, a5, a6
+; RV32I-NEXT:    srl a0, a0, a1
+; RV32I-NEXT:    sb a5, 8(a2)
+; RV32I-NEXT:    sb a0, 12(a2)
 ; RV32I-NEXT:    sb a7, 0(a2)
-; RV32I-NEXT:    sb a6, 1(a2)
-; RV32I-NEXT:    sb a5, 6(a2)
-; RV32I-NEXT:    sb a4, 7(a2)
-; RV32I-NEXT:    sb a3, 4(a2)
-; RV32I-NEXT:    sb a1, 5(a2)
-; RV32I-NEXT:    lw s0, 44(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s1, 40(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s2, 36(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    addi sp, sp, 48
+; RV32I-NEXT:    sb a4, 4(a2)
+; RV32I-NEXT:    srli a1, a0, 16
+; RV32I-NEXT:    sb a1, 14(a2)
+; RV32I-NEXT:    srli a1, a0, 24
+; RV32I-NEXT:    sb a1, 15(a2)
+; RV32I-NEXT:    srli a0, a0, 8
+; RV32I-NEXT:    sb a0, 13(a2)
+; RV32I-NEXT:    srli a0, a6, 16
+; RV32I-NEXT:    sb a0, 10(a2)
+; RV32I-NEXT:    srli a0, a6, 24
+; RV32I-NEXT:    sb a0, 11(a2)
+; RV32I-NEXT:    srli a0, a6, 8
+; RV32I-NEXT:    sb a0, 9(a2)
+; RV32I-NEXT:    srli a0, a3, 16
+; RV32I-NEXT:    sb a0, 2(a2)
+; RV32I-NEXT:    srli a0, a3, 24
+; RV32I-NEXT:    sb a0, 3(a2)
+; RV32I-NEXT:    srli a3, a3, 8
+; RV32I-NEXT:    sb a3, 1(a2)
+; RV32I-NEXT:    srli a0, t0, 16
+; RV32I-NEXT:    sb a0, 6(a2)
+; RV32I-NEXT:    srli a0, t0, 24
+; RV32I-NEXT:    sb a0, 7(a2)
+; RV32I-NEXT:    srli a0, t0, 8
+; RV32I-NEXT:    sb a0, 5(a2)
+; RV32I-NEXT:    addi sp, sp, 32
 ; RV32I-NEXT:    ret
   %src = load i128, ptr %src.ptr, align 1
   %byteOff = load i128, ptr %byteOff.ptr, align 1
@@ -823,6 +842,222 @@ define void @lshr_16bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
   store i128 %res, ptr %dst, align 1
   ret void
 }
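
As a reading aid for the block above: a minimal C sketch of the sequence the
new RV32I checks encode for a 128-bit logical right shift. Illustrative only;
the function and variable names are invented and this is not the legalizer's
code.

    #include <stdint.h>
    #include <string.h>

    static void lshr_i128_via_stack(const uint32_t src[4], unsigned amt,
                                    uint32_t dst[4]) {
      uint32_t buf[8] = {0};        /* aligned slot; upper half zero-filled */
      memcpy(buf, src, 16);         /* the four sw stores at 0..12(sp)      */
      /* Word-granular index: (amt >> 3) & 12 bytes == (amt >> 5) & 3 words,
         so every lw from the slot is 4-byte aligned. */
      const uint32_t *p = buf + ((amt >> 5) & 3);
      unsigned bits = amt & 31;     /* residual in-word shift; the asm can
                                       narrow this to amt & 24 because the
                                       amount is a byte multiple here       */
      for (int i = 0; i < 4; ++i) {
        /* Funnel adjacent words. The asm computes (x << 1) << (31 - bits),
           which stays well defined even for bits == 0; C needs an explicit
           guard against the undefined shift by 32. */
        uint32_t hi = bits ? p[i + 1] << (32 - bits) : 0;
        dst[i] = (p[i] >> bits) | hi;
      }
    }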
+
+define void @lshr_16bytes_wordOff(ptr %src.ptr, ptr %wordOff.ptr, ptr %dst) nounwind {
+; RV64I-LABEL: lshr_16bytes_wordOff:
+; RV64I:       # %bb.0:
+; RV64I-NEXT:    lbu a3, 9(a0)
+; RV64I-NEXT:    lbu a4, 8(a0)
+; RV64I-NEXT:    lbu a5, 10(a0)
+; RV64I-NEXT:    lbu a6, 11(a0)
+; RV64I-NEXT:    slli a3, a3, 8
+; RV64I-NEXT:    or a3, a3, a4
+; RV64I-NEXT:    slli a5, a5, 16
+; RV64I-NEXT:    slli a6, a6, 24
+; RV64I-NEXT:    or a4, a6, a5
+; RV64I-NEXT:    or a3, a4, a3
+; RV64I-NEXT:    lbu a4, 13(a0)
+; RV64I-NEXT:    lbu a5, 12(a0)
+; RV64I-NEXT:    lbu a6, 14(a0)
+; RV64I-NEXT:    lbu a7, 15(a0)
+; RV64I-NEXT:    slli a4, a4, 8
+; RV64I-NEXT:    or a4, a4, a5
+; RV64I-NEXT:    slli a6, a6, 16
+; RV64I-NEXT:    slli a7, a7, 24
+; RV64I-NEXT:    or a5, a7, a6
+; RV64I-NEXT:    or a4, a5, a4
+; RV64I-NEXT:    slli a4, a4, 32
+; RV64I-NEXT:    or a3, a4, a3
+; RV64I-NEXT:    lbu a4, 5(a1)
+; RV64I-NEXT:    lbu a5, 4(a1)
+; RV64I-NEXT:    lbu a6, 6(a1)
+; RV64I-NEXT:    lbu a7, 7(a1)
+; RV64I-NEXT:    slli a4, a4, 8
+; RV64I-NEXT:    or a4, a4, a5
+; RV64I-NEXT:    slli a6, a6, 16
+; RV64I-NEXT:    slli a7, a7, 24
+; RV64I-NEXT:    or a5, a7, a6
+; RV64I-NEXT:    or a4, a5, a4
+; RV64I-NEXT:    lbu a5, 1(a1)
+; RV64I-NEXT:    lbu a6, 0(a1)
+; RV64I-NEXT:    lbu a7, 2(a1)
+; RV64I-NEXT:    lbu a1, 3(a1)
+; RV64I-NEXT:    slli a5, a5, 8
+; RV64I-NEXT:    or a5, a5, a6
+; RV64I-NEXT:    slli a7, a7, 16
+; RV64I-NEXT:    slli a1, a1, 24
+; RV64I-NEXT:    or a1, a1, a7
+; RV64I-NEXT:    or a1, a1, a5
+; RV64I-NEXT:    slli a1, a1, 5
+; RV64I-NEXT:    slli a4, a4, 37
+; RV64I-NEXT:    or a5, a4, a1
+; RV64I-NEXT:    addi a4, a5, -64
+; RV64I-NEXT:    srl a1, a3, a5
+; RV64I-NEXT:    bltz a4, .LBB7_2
+; RV64I-NEXT:  # %bb.1:
+; RV64I-NEXT:    mv a0, a1
+; RV64I-NEXT:    j .LBB7_3
+; RV64I-NEXT:  .LBB7_2:
+; RV64I-NEXT:    lbu a6, 1(a0)
+; RV64I-NEXT:    lbu a7, 0(a0)
+; RV64I-NEXT:    lbu t0, 2(a0)
+; RV64I-NEXT:    lbu t1, 3(a0)
+; RV64I-NEXT:    slli a6, a6, 8
+; RV64I-NEXT:    or a6, a6, a7
+; RV64I-NEXT:    slli t0, t0, 16
+; RV64I-NEXT:    slli t1, t1, 24
+; RV64I-NEXT:    or a7, t1, t0
+; RV64I-NEXT:    or a6, a7, a6
+; RV64I-NEXT:    lbu a7, 5(a0)
+; RV64I-NEXT:    lbu t0, 4(a0)
+; RV64I-NEXT:    lbu t1, 6(a0)
+; RV64I-NEXT:    lbu a0, 7(a0)
+; RV64I-NEXT:    slli a7, a7, 8
+; RV64I-NEXT:    or a7, a7, t0
+; RV64I-NEXT:    slli t1, t1, 16
+; RV64I-NEXT:    slli a0, a0, 24
+; RV64I-NEXT:    or a0, a0, t1
+; RV64I-NEXT:    or a0, a0, a7
+; RV64I-NEXT:    slli a0, a0, 32
+; RV64I-NEXT:    or a0, a0, a6
+; RV64I-NEXT:    srl a0, a0, a5
+; RV64I-NEXT:    not a5, a5
+; RV64I-NEXT:    slli a3, a3, 1
+; RV64I-NEXT:    sll a3, a3, a5
+; RV64I-NEXT:    or a0, a0, a3
+; RV64I-NEXT:  .LBB7_3:
+; RV64I-NEXT:    srai a4, a4, 63
+; RV64I-NEXT:    and a1, a4, a1
+; RV64I-NEXT:    sb a1, 8(a2)
+; RV64I-NEXT:    srli a3, a1, 56
+; RV64I-NEXT:    sb a3, 15(a2)
+; RV64I-NEXT:    srli a3, a1, 48
+; RV64I-NEXT:    sb a3, 14(a2)
+; RV64I-NEXT:    srli a3, a1, 40
+; RV64I-NEXT:    sb a3, 13(a2)
+; RV64I-NEXT:    srli a3, a1, 32
+; RV64I-NEXT:    sb a3, 12(a2)
+; RV64I-NEXT:    srli a3, a1, 24
+; RV64I-NEXT:    sb a3, 11(a2)
+; RV64I-NEXT:    srli a3, a1, 16
+; RV64I-NEXT:    sb a3, 10(a2)
+; RV64I-NEXT:    srli a1, a1, 8
+; RV64I-NEXT:    sb a1, 9(a2)
+; RV64I-NEXT:    sb a0, 0(a2)
+; RV64I-NEXT:    srli a1, a0, 56
+; RV64I-NEXT:    sb a1, 7(a2)
+; RV64I-NEXT:    srli a1, a0, 48
+; RV64I-NEXT:    sb a1, 6(a2)
+; RV64I-NEXT:    srli a1, a0, 40
+; RV64I-NEXT:    sb a1, 5(a2)
+; RV64I-NEXT:    srli a1, a0, 32
+; RV64I-NEXT:    sb a1, 4(a2)
+; RV64I-NEXT:    srli a1, a0, 24
+; RV64I-NEXT:    sb a1, 3(a2)
+; RV64I-NEXT:    srli a1, a0, 16
+; RV64I-NEXT:    sb a1, 2(a2)
+; RV64I-NEXT:    srli a0, a0, 8
+; RV64I-NEXT:    sb a0, 1(a2)
+; RV64I-NEXT:    ret
+;
+; RV32I-LABEL: lshr_16bytes_wordOff:
+; RV32I:       # %bb.0:
+; RV32I-NEXT:    addi sp, sp, -32
+; RV32I-NEXT:    lbu a3, 1(a0)
+; RV32I-NEXT:    lbu a4, 0(a0)
+; RV32I-NEXT:    lbu a5, 2(a0)
+; RV32I-NEXT:    lbu a6, 3(a0)
+; RV32I-NEXT:    slli a3, a3, 8
+; RV32I-NEXT:    or a3, a3, a4
+; RV32I-NEXT:    slli a5, a5, 16
+; RV32I-NEXT:    slli a6, a6, 24
+; RV32I-NEXT:    or a4, a6, a5
+; RV32I-NEXT:    or a3, a4, a3
+; RV32I-NEXT:    lbu a4, 5(a0)
+; RV32I-NEXT:    lbu a5, 4(a0)
+; RV32I-NEXT:    lbu a6, 6(a0)
+; RV32I-NEXT:    lbu a7, 7(a0)
+; RV32I-NEXT:    slli a4, a4, 8
+; RV32I-NEXT:    or a4, a4, a5
+; RV32I-NEXT:    slli a6, a6, 16
+; RV32I-NEXT:    slli a7, a7, 24
+; RV32I-NEXT:    or a5, a7, a6
+; RV32I-NEXT:    or a4, a5, a4
+; RV32I-NEXT:    lbu a5, 9(a0)
+; RV32I-NEXT:    lbu a6, 8(a0)
+; RV32I-NEXT:    lbu a7, 10(a0)
+; RV32I-NEXT:    lbu t0, 11(a0)
+; RV32I-NEXT:    slli a5, a5, 8
+; RV32I-NEXT:    or a5, a5, a6
+; RV32I-NEXT:    slli a7, a7, 16
+; RV32I-NEXT:    slli t0, t0, 24
+; RV32I-NEXT:    or a6, t0, a7
+; RV32I-NEXT:    or a5, a6, a5
+; RV32I-NEXT:    lbu a6, 13(a0)
+; RV32I-NEXT:    lbu a7, 12(a0)
+; RV32I-NEXT:    lbu t0, 14(a0)
+; RV32I-NEXT:    lbu a0, 15(a0)
+; RV32I-NEXT:    slli a6, a6, 8
+; RV32I-NEXT:    or a6, a6, a7
+; RV32I-NEXT:    slli t0, t0, 16
+; RV32I-NEXT:    slli a0, a0, 24
+; RV32I-NEXT:    or a0, a0, t0
+; RV32I-NEXT:    or a0, a0, a6
+; RV32I-NEXT:    lbu a1, 0(a1)
+; RV32I-NEXT:    sw zero, 28(sp)
+; RV32I-NEXT:    sw zero, 24(sp)
+; RV32I-NEXT:    sw zero, 20(sp)
+; RV32I-NEXT:    sw zero, 16(sp)
+; RV32I-NEXT:    sw a0, 12(sp)
+; RV32I-NEXT:    sw a5, 8(sp)
+; RV32I-NEXT:    sw a4, 4(sp)
+; RV32I-NEXT:    sw a3, 0(sp)
+; RV32I-NEXT:    slli a1, a1, 2
+; RV32I-NEXT:    andi a1, a1, 12
+; RV32I-NEXT:    mv a0, sp
+; RV32I-NEXT:    add a0, a0, a1
+; RV32I-NEXT:    lw a1, 8(a0)
+; RV32I-NEXT:    lw a3, 12(a0)
+; RV32I-NEXT:    lw a4, 0(a0)
+; RV32I-NEXT:    lw a0, 4(a0)
+; RV32I-NEXT:    sb a1, 8(a2)
+; RV32I-NEXT:    sb a3, 12(a2)
+; RV32I-NEXT:    sb a4, 0(a2)
+; RV32I-NEXT:    sb a0, 4(a2)
+; RV32I-NEXT:    srli a5, a1, 16
+; RV32I-NEXT:    sb a5, 10(a2)
+; RV32I-NEXT:    srli a5, a1, 24
+; RV32I-NEXT:    sb a5, 11(a2)
+; RV32I-NEXT:    srli a1, a1, 8
+; RV32I-NEXT:    sb a1, 9(a2)
+; RV32I-NEXT:    srli a1, a3, 16
+; RV32I-NEXT:    sb a1, 14(a2)
+; RV32I-NEXT:    srli a1, a3, 24
+; RV32I-NEXT:    sb a1, 15(a2)
+; RV32I-NEXT:    srli a3, a3, 8
+; RV32I-NEXT:    sb a3, 13(a2)
+; RV32I-NEXT:    srli a1, a4, 16
+; RV32I-NEXT:    sb a1, 2(a2)
+; RV32I-NEXT:    srli a1, a4, 24
+; RV32I-NEXT:    sb a1, 3(a2)
+; RV32I-NEXT:    srli a4, a4, 8
+; RV32I-NEXT:    sb a4, 1(a2)
+; RV32I-NEXT:    srli a1, a0, 16
+; RV32I-NEXT:    sb a1, 6(a2)
+; RV32I-NEXT:    srli a1, a0, 24
+; RV32I-NEXT:    sb a1, 7(a2)
+; RV32I-NEXT:    srli a0, a0, 8
+; RV32I-NEXT:    sb a0, 5(a2)
+; RV32I-NEXT:    addi sp, sp, 32
+; RV32I-NEXT:    ret
+  %src = load i128, ptr %src.ptr, align 1
+  %wordOff = load i128, ptr %wordOff.ptr, align 1
+  %bitOff = shl i128 %wordOff, 5
+  %res = lshr i128 %src, %bitOff
+  store i128 %res, ptr %dst, align 1
+  ret void
+}
+
 define void @shl_16bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ; RV64I-LABEL: shl_16bytes:
 ; RV64I:       # %bb.0:
@@ -873,11 +1108,11 @@ define void @shl_16bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ; RV64I-NEXT:    or a5, a4, a1
 ; RV64I-NEXT:    addi a4, a5, -64
 ; RV64I-NEXT:    sll a1, a3, a5
-; RV64I-NEXT:    bltz a4, .LBB7_2
+; RV64I-NEXT:    bltz a4, .LBB8_2
 ; RV64I-NEXT:  # %bb.1:
 ; RV64I-NEXT:    mv a0, a1
-; RV64I-NEXT:    j .LBB7_3
-; RV64I-NEXT:  .LBB7_2:
+; RV64I-NEXT:    j .LBB8_3
+; RV64I-NEXT:  .LBB8_2:
 ; RV64I-NEXT:    lbu a6, 9(a0)
 ; RV64I-NEXT:    lbu a7, 8(a0)
 ; RV64I-NEXT:    lbu t0, 10(a0)
@@ -905,7 +1140,7 @@ define void @shl_16bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ; RV64I-NEXT:    srli a3, a3, 1
 ; RV64I-NEXT:    srl a3, a3, a5
 ; RV64I-NEXT:    or a0, a0, a3
-; RV64I-NEXT:  .LBB7_3:
+; RV64I-NEXT:  .LBB8_3:
 ; RV64I-NEXT:    srai a4, a4, 63
 ; RV64I-NEXT:    and a1, a4, a1
 ; RV64I-NEXT:    sb a1, 0(a2)
@@ -942,98 +1177,117 @@ define void @shl_16bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ;
 ; RV32I-LABEL: shl_16bytes:
 ; RV32I:       # %bb.0:
-; RV32I-NEXT:    addi sp, sp, -48
-; RV32I-NEXT:    sw s0, 44(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s1, 40(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s2, 36(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    lbu a3, 0(a0)
-; RV32I-NEXT:    lbu a4, 1(a0)
+; RV32I-NEXT:    addi sp, sp, -32
+; RV32I-NEXT:    lbu a3, 1(a0)
+; RV32I-NEXT:    lbu a4, 0(a0)
 ; RV32I-NEXT:    lbu a5, 2(a0)
 ; RV32I-NEXT:    lbu a6, 3(a0)
-; RV32I-NEXT:    lbu a7, 4(a0)
-; RV32I-NEXT:    lbu t0, 5(a0)
-; RV32I-NEXT:    lbu t1, 6(a0)
-; RV32I-NEXT:    lbu t2, 7(a0)
-; RV32I-NEXT:    lbu t3, 8(a0)
-; RV32I-NEXT:    lbu t4, 9(a0)
-; RV32I-NEXT:    lbu t5, 10(a0)
-; RV32I-NEXT:    lbu t6, 11(a0)
-; RV32I-NEXT:    lbu s0, 12(a0)
-; RV32I-NEXT:    lbu s1, 13(a0)
-; RV32I-NEXT:    lbu s2, 14(a0)
+; RV32I-NEXT:    slli a3, a3, 8
+; RV32I-NEXT:    or a3, a3, a4
+; RV32I-NEXT:    slli a5, a5, 16
+; RV32I-NEXT:    slli a6, a6, 24
+; RV32I-NEXT:    or a4, a6, a5
+; RV32I-NEXT:    or a3, a4, a3
+; RV32I-NEXT:    lbu a4, 5(a0)
+; RV32I-NEXT:    lbu a5, 4(a0)
+; RV32I-NEXT:    lbu a6, 6(a0)
+; RV32I-NEXT:    lbu a7, 7(a0)
+; RV32I-NEXT:    slli a4, a4, 8
+; RV32I-NEXT:    or a4, a4, a5
+; RV32I-NEXT:    slli a6, a6, 16
+; RV32I-NEXT:    slli a7, a7, 24
+; RV32I-NEXT:    or a5, a7, a6
+; RV32I-NEXT:    or a4, a5, a4
+; RV32I-NEXT:    lbu a5, 9(a0)
+; RV32I-NEXT:    lbu a6, 8(a0)
+; RV32I-NEXT:    lbu a7, 10(a0)
+; RV32I-NEXT:    lbu t0, 11(a0)
+; RV32I-NEXT:    slli a5, a5, 8
+; RV32I-NEXT:    or a5, a5, a6
+; RV32I-NEXT:    slli a7, a7, 16
+; RV32I-NEXT:    slli t0, t0, 24
+; RV32I-NEXT:    or a6, t0, a7
+; RV32I-NEXT:    or a5, a6, a5
+; RV32I-NEXT:    lbu a6, 13(a0)
+; RV32I-NEXT:    lbu a7, 12(a0)
+; RV32I-NEXT:    lbu t0, 14(a0)
 ; RV32I-NEXT:    lbu a0, 15(a0)
-; RV32I-NEXT:    lbu a1, 0(a1)
-; RV32I-NEXT:    sb zero, 19(sp)
-; RV32I-NEXT:    sb zero, 18(sp)
-; RV32I-NEXT:    sb zero, 17(sp)
-; RV32I-NEXT:    sb zero, 16(sp)
-; RV32I-NEXT:    sb zero, 15(sp)
-; RV32I-NEXT:    sb zero, 14(sp)
-; RV32I-NEXT:    sb zero, 13(sp)
-; RV32I-NEXT:    sb zero, 12(sp)
-; RV32I-NEXT:    sb zero, 11(sp)
-; RV32I-NEXT:    sb zero, 10(sp)
-; RV32I-NEXT:    sb zero, 9(sp)
-; RV32I-NEXT:    sb zero, 8(sp)
-; RV32I-NEXT:    sb zero, 7(sp)
-; RV32I-NEXT:    sb zero, 6(sp)
-; RV32I-NEXT:    sb zero, 5(sp)
-; RV32I-NEXT:    sb zero, 4(sp)
-; RV32I-NEXT:    sb a0, 35(sp)
-; RV32I-NEXT:    sb s2, 34(sp)
-; RV32I-NEXT:    sb s1, 33(sp)
-; RV32I-NEXT:    sb s0, 32(sp)
-; RV32I-NEXT:    sb t6, 31(sp)
-; RV32I-NEXT:    sb t5, 30(sp)
-; RV32I-NEXT:    sb t4, 29(sp)
-; RV32I-NEXT:    sb t3, 28(sp)
-; RV32I-NEXT:    sb t2, 27(sp)
-; RV32I-NEXT:    sb t1, 26(sp)
-; RV32I-NEXT:    sb t0, 25(sp)
-; RV32I-NEXT:    sb a7, 24(sp)
-; RV32I-NEXT:    sb a6, 23(sp)
-; RV32I-NEXT:    sb a5, 22(sp)
-; RV32I-NEXT:    sb a4, 21(sp)
-; RV32I-NEXT:    sb a3, 20(sp)
-; RV32I-NEXT:    andi a1, a1, 15
-; RV32I-NEXT:    addi a0, sp, 20
-; RV32I-NEXT:    sub a0, a0, a1
-; RV32I-NEXT:    lbu a1, 5(a0)
-; RV32I-NEXT:    lbu a3, 4(a0)
-; RV32I-NEXT:    lbu a4, 7(a0)
-; RV32I-NEXT:    lbu a5, 6(a0)
-; RV32I-NEXT:    lbu a6, 1(a0)
-; RV32I-NEXT:    lbu a7, 0(a0)
-; RV32I-NEXT:    lbu t0, 3(a0)
-; RV32I-NEXT:    lbu t1, 2(a0)
-; RV32I-NEXT:    lbu t2, 13(a0)
-; RV32I-NEXT:    lbu t3, 12(a0)
-; RV32I-NEXT:    lbu t4, 15(a0)
-; RV32I-NEXT:    lbu t5, 14(a0)
-; RV32I-NEXT:    lbu t6, 10(a0)
-; RV32I-NEXT:    lbu s0, 11(a0)
-; RV32I-NEXT:    lbu s1, 8(a0)
-; RV32I-NEXT:    lbu a0, 9(a0)
-; RV32I-NEXT:    sb t6, 10(a2)
-; RV32I-NEXT:    sb s0, 11(a2)
-; RV32I-NEXT:    sb s1, 8(a2)
+; RV32I-NEXT:    slli a6, a6, 8
+; RV32I-NEXT:    or a6, a6, a7
+; RV32I-NEXT:    slli t0, t0, 16
+; RV32I-NEXT:    slli a0, a0, 24
+; RV32I-NEXT:    or a0, a0, t0
+; RV32I-NEXT:    or a0, a0, a6
+; RV32I-NEXT:    lbu a6, 1(a1)
+; RV32I-NEXT:    lbu a7, 0(a1)
+; RV32I-NEXT:    lbu t0, 2(a1)
+; RV32I-NEXT:    lbu a1, 3(a1)
+; RV32I-NEXT:    slli a6, a6, 8
+; RV32I-NEXT:    or a6, a6, a7
+; RV32I-NEXT:    slli t0, t0, 16
+; RV32I-NEXT:    slli a1, a1, 24
+; RV32I-NEXT:    or a1, a1, t0
+; RV32I-NEXT:    or a1, a1, a6
+; RV32I-NEXT:    sw zero, 12(sp)
+; RV32I-NEXT:    sw zero, 8(sp)
+; RV32I-NEXT:    sw zero, 4(sp)
+; RV32I-NEXT:    sw zero, 0(sp)
+; RV32I-NEXT:    sw a0, 28(sp)
+; RV32I-NEXT:    sw a5, 24(sp)
+; RV32I-NEXT:    sw a4, 20(sp)
+; RV32I-NEXT:    sw a3, 16(sp)
+; RV32I-NEXT:    andi a0, a1, 12
+; RV32I-NEXT:    addi a3, sp, 16
+; RV32I-NEXT:    sub a3, a3, a0
+; RV32I-NEXT:    lw a0, 4(a3)
+; RV32I-NEXT:    slli a1, a1, 3
+; RV32I-NEXT:    lw a4, 0(a3)
+; RV32I-NEXT:    sll a5, a0, a1
+; RV32I-NEXT:    andi a6, a1, 24
+; RV32I-NEXT:    xori a6, a6, 31
+; RV32I-NEXT:    srli a7, a4, 1
+; RV32I-NEXT:    lw t0, 12(a3)
+; RV32I-NEXT:    lw a3, 8(a3)
+; RV32I-NEXT:    srl a7, a7, a6
+; RV32I-NEXT:    or a7, a5, a7
+; RV32I-NEXT:    sll t0, t0, a1
+; RV32I-NEXT:    srli t1, a3, 1
+; RV32I-NEXT:    srl t1, t1, a6
+; RV32I-NEXT:    or t1, t0, t1
+; RV32I-NEXT:    sll a3, a3, a1
+; RV32I-NEXT:    srli a0, a0, 1
+; RV32I-NEXT:    srl a0, a0, a6
+; RV32I-NEXT:    or a0, a3, a0
+; RV32I-NEXT:    sll a1, a4, a1
+; RV32I-NEXT:    sb a1, 0(a2)
+; RV32I-NEXT:    srli a3, a3, 24
+; RV32I-NEXT:    sb a3, 11(a2)
+; RV32I-NEXT:    srli a3, t0, 24
+; RV32I-NEXT:    sb a3, 15(a2)
+; RV32I-NEXT:    srli a3, a1, 16
+; RV32I-NEXT:    sb a3, 2(a2)
+; RV32I-NEXT:    srli a3, a1, 24
+; RV32I-NEXT:    sb a3, 3(a2)
+; RV32I-NEXT:    srli a1, a1, 8
+; RV32I-NEXT:    sb a1, 1(a2)
+; RV32I-NEXT:    srli a5, a5, 24
+; RV32I-NEXT:    sb a5, 7(a2)
+; RV32I-NEXT:    sb a0, 8(a2)
+; RV32I-NEXT:    sb t1, 12(a2)
+; RV32I-NEXT:    sb a7, 4(a2)
+; RV32I-NEXT:    srli a1, a0, 16
+; RV32I-NEXT:    sb a1, 10(a2)
+; RV32I-NEXT:    srli a0, a0, 8
 ; RV32I-NEXT:    sb a0, 9(a2)
-; RV32I-NEXT:    sb t5, 14(a2)
-; RV32I-NEXT:    sb t4, 15(a2)
-; RV32I-NEXT:    sb t3, 12(a2)
-; RV32I-NEXT:    sb t2, 13(a2)
-; RV32I-NEXT:    sb t1, 2(a2)
-; RV32I-NEXT:    sb t0, 3(a2)
-; RV32I-NEXT:    sb a7, 0(a2)
-; RV32I-NEXT:    sb a6, 1(a2)
-; RV32I-NEXT:    sb a5, 6(a2)
-; RV32I-NEXT:    sb a4, 7(a2)
-; RV32I-NEXT:    sb a3, 4(a2)
-; RV32I-NEXT:    sb a1, 5(a2)
-; RV32I-NEXT:    lw s0, 44(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s1, 40(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s2, 36(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    addi sp, sp, 48
+; RV32I-NEXT:    srli a0, t1, 16
+; RV32I-NEXT:    sb a0, 14(a2)
+; RV32I-NEXT:    srli a0, t1, 8
+; RV32I-NEXT:    sb a0, 13(a2)
+; RV32I-NEXT:    srli a0, a7, 16
+; RV32I-NEXT:    sb a0, 6(a2)
+; RV32I-NEXT:    srli a0, a7, 8
+; RV32I-NEXT:    sb a0, 5(a2)
+; RV32I-NEXT:    addi sp, sp, 32
 ; RV32I-NEXT:    ret
   %src = load i128, ptr %src.ptr, align 1
   %byteOff = load i128, ptr %byteOff.ptr, align 1
@@ -1042,6 +1296,223 @@ define void @shl_16bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
   store i128 %res, ptr %dst, align 1
   ret void
 }
+
+define void @shl_16bytes_wordOff(ptr %src.ptr, ptr %wordOff.ptr, ptr %dst) nounwind {
+; RV64I-LABEL: shl_16bytes_wordOff:
+; RV64I:       # %bb.0:
+; RV64I-NEXT:    lbu a3, 1(a0)
+; RV64I-NEXT:    lbu a4, 0(a0)
+; RV64I-NEXT:    lbu a5, 2(a0)
+; RV64I-NEXT:    lbu a6, 3(a0)
+; RV64I-NEXT:    slli a3, a3, 8
+; RV64I-NEXT:    or a3, a3, a4
+; RV64I-NEXT:    slli a5, a5, 16
+; RV64I-NEXT:    slli a6, a6, 24
+; RV64I-NEXT:    or a4, a6, a5
+; RV64I-NEXT:    or a3, a4, a3
+; RV64I-NEXT:    lbu a4, 5(a0)
+; RV64I-NEXT:    lbu a5, 4(a0)
+; RV64I-NEXT:    lbu a6, 6(a0)
+; RV64I-NEXT:    lbu a7, 7(a0)
+; RV64I-NEXT:    slli a4, a4, 8
+; RV64I-NEXT:    or a4, a4, a5
+; RV64I-NEXT:    slli a6, a6, 16
+; RV64I-NEXT:    slli a7, a7, 24
+; RV64I-NEXT:    or a5, a7, a6
+; RV64I-NEXT:    or a4, a5, a4
+; RV64I-NEXT:    slli a4, a4, 32
+; RV64I-NEXT:    or a3, a4, a3
+; RV64I-NEXT:    lbu a4, 5(a1)
+; RV64I-NEXT:    lbu a5, 4(a1)
+; RV64I-NEXT:    lbu a6, 6(a1)
+; RV64I-NEXT:    lbu a7, 7(a1)
+; RV64I-NEXT:    slli a4, a4, 8
+; RV64I-NEXT:    or a4, a4, a5
+; RV64I-NEXT:    slli a6, a6, 16
+; RV64I-NEXT:    slli a7, a7, 24
+; RV64I-NEXT:    or a5, a7, a6
+; RV64I-NEXT:    or a4, a5, a4
+; RV64I-NEXT:    lbu a5, 1(a1)
+; RV64I-NEXT:    lbu a6, 0(a1)
+; RV64I-NEXT:    lbu a7, 2(a1)
+; RV64I-NEXT:    lbu a1, 3(a1)
+; RV64I-NEXT:    slli a5, a5, 8
+; RV64I-NEXT:    or a5, a5, a6
+; RV64I-NEXT:    slli a7, a7, 16
+; RV64I-NEXT:    slli a1, a1, 24
+; RV64I-NEXT:    or a1, a1, a7
+; RV64I-NEXT:    or a1, a1, a5
+; RV64I-NEXT:    slli a1, a1, 5
+; RV64I-NEXT:    slli a4, a4, 37
+; RV64I-NEXT:    or a5, a4, a1
+; RV64I-NEXT:    addi a4, a5, -64
+; RV64I-NEXT:    sll a1, a3, a5
+; RV64I-NEXT:    bltz a4, .LBB9_2
+; RV64I-NEXT:  # %bb.1:
+; RV64I-NEXT:    mv a0, a1
+; RV64I-NEXT:    j .LBB9_3
+; RV64I-NEXT:  .LBB9_2:
+; RV64I-NEXT:    lbu a6, 9(a0)
+; RV64I-NEXT:    lbu a7, 8(a0)
+; RV64I-NEXT:    lbu t0, 10(a0)
+; RV64I-NEXT:    lbu t1, 11(a0)
+; RV64I-NEXT:    slli a6, a6, 8
+; RV64I-NEXT:    or a6, a6, a7
+; RV64I-NEXT:    slli t0, t0, 16
+; RV64I-NEXT:    slli t1, t1, 24
+; RV64I-NEXT:    or a7, t1, t0
+; RV64I-NEXT:    or a6, a7, a6
+; RV64I-NEXT:    lbu a7, 13(a0)
+; RV64I-NEXT:    lbu t0, 12(a0)
+; RV64I-NEXT:    lbu t1, 14(a0)
+; RV64I-NEXT:    lbu a0, 15(a0)
+; RV64I-NEXT:    slli a7, a7, 8
+; RV64I-NEXT:    or a7, a7, t0
+; RV64I-NEXT:    slli t1, t1, 16
+; RV64I-NEXT:    slli a0, a0, 24
+; RV64I-NEXT:    or a0, a0, t1
+; RV64I-NEXT:    or a0, a0, a7
+; RV64I-NEXT:    slli a0, a0, 32
+; RV64I-NEXT:    or a0, a0, a6
+; RV64I-NEXT:    sll a0, a0, a5
+; RV64I-NEXT:    not a5, a5
+; RV64I-NEXT:    srli a3, a3, 1
+; RV64I-NEXT:    srl a3, a3, a5
+; RV64I-NEXT:    or a0, a0, a3
+; RV64I-NEXT:  .LBB9_3:
+; RV64I-NEXT:    srai a4, a4, 63
+; RV64I-NEXT:    and a1, a4, a1
+; RV64I-NEXT:    sb a1, 0(a2)
+; RV64I-NEXT:    sb a0, 8(a2)
+; RV64I-NEXT:    srli a3, a1, 56
+; RV64I-NEXT:    sb a3, 7(a2)
+; RV64I-NEXT:    srli a3, a1, 48
+; RV64I-NEXT:    sb a3, 6(a2)
+; RV64I-NEXT:    srli a3, a1, 40
+; RV64I-NEXT:    sb a3, 5(a2)
+; RV64I-NEXT:    srli a3, a1, 32
+; RV64I-NEXT:    sb a3, 4(a2)
+; RV64I-NEXT:    srli a3, a1, 24
+; RV64I-NEXT:    sb a3, 3(a2)
+; RV64I-NEXT:    srli a3, a1, 16
+; RV64I-NEXT:    sb a3, 2(a2)
+; RV64I-NEXT:    srli a1, a1, 8
+; RV64I-NEXT:    sb a1, 1(a2)
+; RV64I-NEXT:    srli a1, a0, 56
+; RV64I-NEXT:    sb a1, 15(a2)
+; RV64I-NEXT:    srli a1, a0, 48
+; RV64I-NEXT:    sb a1, 14(a2)
+; RV64I-NEXT:    srli a1, a0, 40
+; RV64I-NEXT:    sb a1, 13(a2)
+; RV64I-NEXT:    srli a1, a0, 32
+; RV64I-NEXT:    sb a1, 12(a2)
+; RV64I-NEXT:    srli a1, a0, 24
+; RV64I-NEXT:    sb a1, 11(a2)
+; RV64I-NEXT:    srli a1, a0, 16
+; RV64I-NEXT:    sb a1, 10(a2)
+; RV64I-NEXT:    srli a0, a0, 8
+; RV64I-NEXT:    sb a0, 9(a2)
+; RV64I-NEXT:    ret
+;
+; RV32I-LABEL: shl_16bytes_wordOff:
+; RV32I:       # %bb.0:
+; RV32I-NEXT:    addi sp, sp, -32
+; RV32I-NEXT:    lbu a3, 1(a0)
+; RV32I-NEXT:    lbu a4, 0(a0)
+; RV32I-NEXT:    lbu a5, 2(a0)
+; RV32I-NEXT:    lbu a6, 3(a0)
+; RV32I-NEXT:    slli a3, a3, 8
+; RV32I-NEXT:    or a3, a3, a4
+; RV32I-NEXT:    slli a5, a5, 16
+; RV32I-NEXT:    slli a6, a6, 24
+; RV32I-NEXT:    or a4, a6, a5
+; RV32I-NEXT:    or a3, a4, a3
+; RV32I-NEXT:    lbu a4, 5(a0)
+; RV32I-NEXT:    lbu a5, 4(a0)
+; RV32I-NEXT:    lbu a6, 6(a0)
+; RV32I-NEXT:    lbu a7, 7(a0)
+; RV32I-NEXT:    slli a4, a4, 8
+; RV32I-NEXT:    or a4, a4, a5
+; RV32I-NEXT:    slli a6, a6, 16
+; RV32I-NEXT:    slli a7, a7, 24
+; RV32I-NEXT:    or a5, a7, a6
+; RV32I-NEXT:    or a4, a5, a4
+; RV32I-NEXT:    lbu a5, 9(a0)
+; RV32I-NEXT:    lbu a6, 8(a0)
+; RV32I-NEXT:    lbu a7, 10(a0)
+; RV32I-NEXT:    lbu t0, 11(a0)
+; RV32I-NEXT:    slli a5, a5, 8
+; RV32I-NEXT:    or a5, a5, a6
+; RV32I-NEXT:    slli a7, a7, 16
+; RV32I-NEXT:    slli t0, t0, 24
+; RV32I-NEXT:    or a6, t0, a7
+; RV32I-NEXT:    or a5, a6, a5
+; RV32I-NEXT:    lbu a6, 13(a0)
+; RV32I-NEXT:    lbu a7, 12(a0)
+; RV32I-NEXT:    lbu t0, 14(a0)
+; RV32I-NEXT:    lbu a0, 15(a0)
+; RV32I-NEXT:    slli a6, a6, 8
+; RV32I-NEXT:    or a6, a6, a7
+; RV32I-NEXT:    slli t0, t0, 16
+; RV32I-NEXT:    slli a0, a0, 24
+; RV32I-NEXT:    or a0, a0, t0
+; RV32I-NEXT:    or a0, a0, a6
+; RV32I-NEXT:    lbu a1, 0(a1)
+; RV32I-NEXT:    sw zero, 12(sp)
+; RV32I-NEXT:    sw zero, 8(sp)
+; RV32I-NEXT:    sw zero, 4(sp)
+; RV32I-NEXT:    sw zero, 0(sp)
+; RV32I-NEXT:    sw a0, 28(sp)
+; RV32I-NEXT:    sw a5, 24(sp)
+; RV32I-NEXT:    sw a4, 20(sp)
+; RV32I-NEXT:    sw a3, 16(sp)
+; RV32I-NEXT:    slli a1, a1, 2
+; RV32I-NEXT:    andi a1, a1, 12
+; RV32I-NEXT:    addi a0, sp, 16
+; RV32I-NEXT:    sub a0, a0, a1
+; RV32I-NEXT:    lw a1, 8(a0)
+; RV32I-NEXT:    lw a3, 12(a0)
+; RV32I-NEXT:    lw a4, 0(a0)
+; RV32I-NEXT:    lw a0, 4(a0)
+; RV32I-NEXT:    sb a1, 8(a2)
+; RV32I-NEXT:    sb a3, 12(a2)
+; RV32I-NEXT:    sb a4, 0(a2)
+; RV32I-NEXT:    sb a0, 4(a2)
+; RV32I-NEXT:    srli a5, a1, 16
+; RV32I-NEXT:    sb a5, 10(a2)
+; RV32I-NEXT:    srli a5, a1, 24
+; RV32I-NEXT:    sb a5, 11(a2)
+; RV32I-NEXT:    srli a1, a1, 8
+; RV32I-NEXT:    sb a1, 9(a2)
+; RV32I-NEXT:    srli a1, a3, 16
+; RV32I-NEXT:    sb a1, 14(a2)
+; RV32I-NEXT:    srli a1, a3, 24
+; RV32I-NEXT:    sb a1, 15(a2)
+; RV32I-NEXT:    srli a3, a3, 8
+; RV32I-NEXT:    sb a3, 13(a2)
+; RV32I-NEXT:    srli a1, a4, 16
+; RV32I-NEXT:    sb a1, 2(a2)
+; RV32I-NEXT:    srli a1, a4, 24
+; RV32I-NEXT:    sb a1, 3(a2)
+; RV32I-NEXT:    srli a4, a4, 8
+; RV32I-NEXT:    sb a4, 1(a2)
+; RV32I-NEXT:    srli a1, a0, 16
+; RV32I-NEXT:    sb a1, 6(a2)
+; RV32I-NEXT:    srli a1, a0, 24
+; RV32I-NEXT:    sb a1, 7(a2)
+; RV32I-NEXT:    srli a0, a0, 8
+; RV32I-NEXT:    sb a0, 5(a2)
+; RV32I-NEXT:    addi sp, sp, 32
+; RV32I-NEXT:    ret
+  %src = load i128, ptr %src.ptr, align 1
+  %wordOff = load i128, ptr %wordOff.ptr, align 1
+  %bitOff = shl i128 %wordOff, 5
+  %res = shl i128 %src, %bitOff
+  store i128 %res, ptr %dst, align 1
+  ret void
+}
+
+
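Both new *_wordOff bodies above come out noticeably shorter than their
byteOff counterparts: the IR multiplies the offset by 32 (`shl i128 %wordOff,
5`), so the residual in-word shift is provably zero and the RV32I output is
just aligned lw/sb with no srl/sll funnel. A hedged C analogue of that
degenerate case (invented names, not the patch's code):

    #include <stdint.h>
    #include <string.h>

    static void lshr_i128_words(const uint32_t src[4], unsigned wordOff,
                                uint32_t dst[4]) {
      uint32_t buf[8] = {0};                    /* value low, zeros high  */
      memcpy(buf, src, 16);
      memcpy(dst, buf + (wordOff & 3), 16);     /* index upward for lshr  */
    }

    static void shl_i128_words(const uint32_t src[4], unsigned wordOff,
                               uint32_t dst[4]) {
      uint32_t buf[8] = {0};                    /* zeros low, value high  */
      memcpy(buf + 4, src, 16);
      memcpy(dst, buf + 4 - (wordOff & 3), 16); /* index downward for shl */
    }
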
 define void @ashr_16bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ; RV64I-LABEL: ashr_16bytes:
 ; RV64I:       # %bb.0:
@@ -1092,13 +1563,13 @@ define void @ashr_16bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ; RV64I-NEXT:    or a5, a5, a1
 ; RV64I-NEXT:    addi a6, a5, -64
 ; RV64I-NEXT:    sra a1, a3, a5
-; RV64I-NEXT:    bltz a6, .LBB8_2
+; RV64I-NEXT:    bltz a6, .LBB10_2
 ; RV64I-NEXT:  # %bb.1:
 ; RV64I-NEXT:    sraiw a3, a4, 31
 ; RV64I-NEXT:    mv a0, a1
 ; RV64I-NEXT:    mv a1, a3
-; RV64I-NEXT:    j .LBB8_3
-; RV64I-NEXT:  .LBB8_2:
+; RV64I-NEXT:    j .LBB10_3
+; RV64I-NEXT:  .LBB10_2:
 ; RV64I-NEXT:    lbu a4, 1(a0)
 ; RV64I-NEXT:    lbu a6, 0(a0)
 ; RV64I-NEXT:    lbu a7, 2(a0)
@@ -1126,7 +1597,7 @@ define void @ashr_16bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ; RV64I-NEXT:    slli a3, a3, 1
 ; RV64I-NEXT:    sll a3, a3, a4
 ; RV64I-NEXT:    or a0, a0, a3
-; RV64I-NEXT:  .LBB8_3:
+; RV64I-NEXT:  .LBB10_3:
 ; RV64I-NEXT:    sb a1, 8(a2)
 ; RV64I-NEXT:    srli a3, a1, 56
 ; RV64I-NEXT:    sb a3, 15(a2)
@@ -1161,105 +1632,118 @@ define void @ashr_16bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ;
 ; RV32I-LABEL: ashr_16bytes:
 ; RV32I:       # %bb.0:
-; RV32I-NEXT:    addi sp, sp, -48
-; RV32I-NEXT:    sw s0, 44(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s1, 40(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s2, 36(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s3, 32(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    lbu a3, 15(a0)
-; RV32I-NEXT:    slli a4, a3, 24
-; RV32I-NEXT:    lbu a5, 0(a0)
-; RV32I-NEXT:    lbu a6, 1(a0)
-; RV32I-NEXT:    lbu a7, 2(a0)
-; RV32I-NEXT:    lbu t0, 3(a0)
-; RV32I-NEXT:    lbu t1, 4(a0)
-; RV32I-NEXT:    lbu t2, 5(a0)
-; RV32I-NEXT:    lbu t3, 6(a0)
-; RV32I-NEXT:    lbu t4, 7(a0)
-; RV32I-NEXT:    lbu t5, 8(a0)
-; RV32I-NEXT:    lbu t6, 9(a0)
-; RV32I-NEXT:    lbu s0, 10(a0)
-; RV32I-NEXT:    lbu s1, 11(a0)
-; RV32I-NEXT:    lbu s2, 12(a0)
-; RV32I-NEXT:    lbu s3, 14(a0)
-; RV32I-NEXT:    lbu a0, 13(a0)
-; RV32I-NEXT:    lbu a1, 0(a1)
-; RV32I-NEXT:    sb a3, 15(sp)
-; RV32I-NEXT:    sb s3, 14(sp)
-; RV32I-NEXT:    sb a0, 13(sp)
-; RV32I-NEXT:    sb s2, 12(sp)
-; RV32I-NEXT:    sb s1, 11(sp)
-; RV32I-NEXT:    sb s0, 10(sp)
-; RV32I-NEXT:    sb t6, 9(sp)
-; RV32I-NEXT:    sb t5, 8(sp)
-; RV32I-NEXT:    sb t4, 7(sp)
-; RV32I-NEXT:    sb t3, 6(sp)
-; RV32I-NEXT:    sb t2, 5(sp)
-; RV32I-NEXT:    sb t1, 4(sp)
-; RV32I-NEXT:    sb t0, 3(sp)
-; RV32I-NEXT:    sb a7, 2(sp)
-; RV32I-NEXT:    sb a6, 1(sp)
-; RV32I-NEXT:    sb a5, 0(sp)
-; RV32I-NEXT:    srai a4, a4, 31
-; RV32I-NEXT:    sb a4, 28(sp)
-; RV32I-NEXT:    sb a4, 24(sp)
-; RV32I-NEXT:    sb a4, 20(sp)
-; RV32I-NEXT:    sb a4, 16(sp)
-; RV32I-NEXT:    srli a0, a4, 24
-; RV32I-NEXT:    sb a0, 31(sp)
-; RV32I-NEXT:    srli a3, a4, 16
-; RV32I-NEXT:    sb a3, 30(sp)
-; RV32I-NEXT:    srli a4, a4, 8
-; RV32I-NEXT:    sb a4, 29(sp)
-; RV32I-NEXT:    sb a0, 27(sp)
-; RV32I-NEXT:    sb a3, 26(sp)
-; RV32I-NEXT:    sb a4, 25(sp)
-; RV32I-NEXT:    sb a0, 23(sp)
-; RV32I-NEXT:    sb a3, 22(sp)
-; RV32I-NEXT:    sb a4, 21(sp)
-; RV32I-NEXT:    sb a0, 19(sp)
-; RV32I-NEXT:    sb a3, 18(sp)
-; RV32I-NEXT:    sb a4, 17(sp)
-; RV32I-NEXT:    andi a1, a1, 15
-; RV32I-NEXT:    mv a0, sp
-; RV32I-NEXT:    add a0, a0, a1
-; RV32I-NEXT:    lbu a1, 5(a0)
-; RV32I-NEXT:    lbu a3, 4(a0)
-; RV32I-NEXT:    lbu a4, 7(a0)
-; RV32I-NEXT:    lbu a5, 6(a0)
-; RV32I-NEXT:    lbu a6, 1(a0)
-; RV32I-NEXT:    lbu a7, 0(a0)
-; RV32I-NEXT:    lbu t0, 3(a0)
-; RV32I-NEXT:    lbu t1, 2(a0)
-; RV32I-NEXT:    lbu t2, 13(a0)
-; RV32I-NEXT:    lbu t3, 12(a0)
-; RV32I-NEXT:    lbu t4, 15(a0)
-; RV32I-NEXT:    lbu t5, 14(a0)
-; RV32I-NEXT:    lbu t6, 10(a0)
-; RV32I-NEXT:    lbu s0, 11(a0)
-; RV32I-NEXT:    lbu s1, 8(a0)
-; RV32I-NEXT:    lbu a0, 9(a0)
-; RV32I-NEXT:    sb t6, 10(a2)
-; RV32I-NEXT:    sb s0, 11(a2)
-; RV32I-NEXT:    sb s1, 8(a2)
-; RV32I-NEXT:    sb a0, 9(a2)
-; RV32I-NEXT:    sb t5, 14(a2)
-; RV32I-NEXT:    sb t4, 15(a2)
-; RV32I-NEXT:    sb t3, 12(a2)
-; RV32I-NEXT:    sb t2, 13(a2)
-; RV32I-NEXT:    sb t1, 2(a2)
-; RV32I-NEXT:    sb t0, 3(a2)
+; RV32I-NEXT:    addi sp, sp, -32
+; RV32I-NEXT:    lbu a3, 1(a0)
+; RV32I-NEXT:    lbu a4, 0(a0)
+; RV32I-NEXT:    lbu a5, 2(a0)
+; RV32I-NEXT:    lbu a6, 3(a0)
+; RV32I-NEXT:    slli a3, a3, 8
+; RV32I-NEXT:    or a3, a3, a4
+; RV32I-NEXT:    slli a5, a5, 16
+; RV32I-NEXT:    slli a6, a6, 24
+; RV32I-NEXT:    or a4, a6, a5
+; RV32I-NEXT:    or a3, a4, a3
+; RV32I-NEXT:    lbu a4, 5(a0)
+; RV32I-NEXT:    lbu a5, 4(a0)
+; RV32I-NEXT:    lbu a6, 6(a0)
+; RV32I-NEXT:    lbu a7, 7(a0)
+; RV32I-NEXT:    slli a4, a4, 8
+; RV32I-NEXT:    or a4, a4, a5
+; RV32I-NEXT:    slli a6, a6, 16
+; RV32I-NEXT:    slli a7, a7, 24
+; RV32I-NEXT:    or a5, a7, a6
+; RV32I-NEXT:    or a4, a5, a4
+; RV32I-NEXT:    lbu a5, 9(a0)
+; RV32I-NEXT:    lbu a6, 8(a0)
+; RV32I-NEXT:    lbu a7, 10(a0)
+; RV32I-NEXT:    lbu t0, 11(a0)
+; RV32I-NEXT:    slli a5, a5, 8
+; RV32I-NEXT:    or a5, a5, a6
+; RV32I-NEXT:    slli a7, a7, 16
+; RV32I-NEXT:    slli t0, t0, 24
+; RV32I-NEXT:    or a6, t0, a7
+; RV32I-NEXT:    or a5, a6, a5
+; RV32I-NEXT:    lbu a6, 13(a0)
+; RV32I-NEXT:    lbu a7, 12(a0)
+; RV32I-NEXT:    lbu t0, 14(a0)
+; RV32I-NEXT:    lbu a0, 15(a0)
+; RV32I-NEXT:    slli a6, a6, 8
+; RV32I-NEXT:    or a6, a6, a7
+; RV32I-NEXT:    slli t0, t0, 16
+; RV32I-NEXT:    slli a0, a0, 24
+; RV32I-NEXT:    or a7, a0, t0
+; RV32I-NEXT:    or a6, a7, a6
+; RV32I-NEXT:    lbu a7, 1(a1)
+; RV32I-NEXT:    lbu t0, 0(a1)
+; RV32I-NEXT:    lbu t1, 2(a1)
+; RV32I-NEXT:    lbu a1, 3(a1)
+; RV32I-NEXT:    slli a7, a7, 8
+; RV32I-NEXT:    or a7, a7, t0
+; RV32I-NEXT:    slli t1, t1, 16
+; RV32I-NEXT:    slli a1, a1, 24
+; RV32I-NEXT:    or a1, a1, t1
+; RV32I-NEXT:    or a1, a1, a7
+; RV32I-NEXT:    srai a0, a0, 31
+; RV32I-NEXT:    sw a0, 28(sp)
+; RV32I-NEXT:    sw a0, 24(sp)
+; RV32I-NEXT:    sw a0, 20(sp)
+; RV32I-NEXT:    sw a0, 16(sp)
+; RV32I-NEXT:    sw a6, 12(sp)
+; RV32I-NEXT:    sw a5, 8(sp)
+; RV32I-NEXT:    sw a4, 4(sp)
+; RV32I-NEXT:    sw a3, 0(sp)
+; RV32I-NEXT:    andi a0, a1, 12
+; RV32I-NEXT:    mv a3, sp
+; RV32I-NEXT:    add a0, a3, a0
+; RV32I-NEXT:    lw a3, 4(a0)
+; RV32I-NEXT:    slli a1, a1, 3
+; RV32I-NEXT:    srl a4, a3, a1
+; RV32I-NEXT:    lw a5, 8(a0)
+; RV32I-NEXT:    andi a6, a1, 24
+; RV32I-NEXT:    xori a6, a6, 31
+; RV32I-NEXT:    lw a7, 0(a0)
+; RV32I-NEXT:    slli t0, a5, 1
+; RV32I-NEXT:    sll t0, t0, a6
+; RV32I-NEXT:    or t0, a4, t0
+; RV32I-NEXT:    srl a7, a7, a1
+; RV32I-NEXT:    slli a3, a3, 1
+; RV32I-NEXT:    lw a0, 12(a0)
+; RV32I-NEXT:    sll a3, a3, a6
+; RV32I-NEXT:    or a3, a7, a3
+; RV32I-NEXT:    srl a5, a5, a1
+; RV32I-NEXT:    slli t1, a0, 1
+; RV32I-NEXT:    sll a6, t1, a6
+; RV32I-NEXT:    or a6, a5, a6
+; RV32I-NEXT:    sra a0, a0, a1
+; RV32I-NEXT:    sb a5, 8(a2)
+; RV32I-NEXT:    sb a0, 12(a2)
 ; RV32I-NEXT:    sb a7, 0(a2)
-; RV32I-NEXT:    sb a6, 1(a2)
-; RV32I-NEXT:    sb a5, 6(a2)
-; RV32I-NEXT:    sb a4, 7(a2)
-; RV32I-NEXT:    sb a3, 4(a2)
-; RV32I-NEXT:    sb a1, 5(a2)
-; RV32I-NEXT:    lw s0, 44(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s1, 40(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s2, 36(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s3, 32(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    addi sp, sp, 48
+; RV32I-NEXT:    sb a4, 4(a2)
+; RV32I-NEXT:    srli a1, a0, 16
+; RV32I-NEXT:    sb a1, 14(a2)
+; RV32I-NEXT:    srli a1, a0, 24
+; RV32I-NEXT:    sb a1, 15(a2)
+; RV32I-NEXT:    srli a0, a0, 8
+; RV32I-NEXT:    sb a0, 13(a2)
+; RV32I-NEXT:    srli a0, a6, 16
+; RV32I-NEXT:    sb a0, 10(a2)
+; RV32I-NEXT:    srli a0, a6, 24
+; RV32I-NEXT:    sb a0, 11(a2)
+; RV32I-NEXT:    srli a0, a6, 8
+; RV32I-NEXT:    sb a0, 9(a2)
+; RV32I-NEXT:    srli a0, a3, 16
+; RV32I-NEXT:    sb a0, 2(a2)
+; RV32I-NEXT:    srli a0, a3, 24
+; RV32I-NEXT:    sb a0, 3(a2)
+; RV32I-NEXT:    srli a3, a3, 8
+; RV32I-NEXT:    sb a3, 1(a2)
+; RV32I-NEXT:    srli a0, t0, 16
+; RV32I-NEXT:    sb a0, 6(a2)
+; RV32I-NEXT:    srli a0, t0, 24
+; RV32I-NEXT:    sb a0, 7(a2)
+; RV32I-NEXT:    srli a0, t0, 8
+; RV32I-NEXT:    sb a0, 5(a2)
+; RV32I-NEXT:    addi sp, sp, 32
 ; RV32I-NEXT:    ret
   %src = load i128, ptr %src.ptr, align 1
   %byteOff = load i128, ptr %byteOff.ptr, align 1
@@ -1269,1347 +1753,3730 @@ define void @ashr_16bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
   ret void
 }
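
For completeness, the rewritten ashr_16bytes body above differs from the lshr
one only in the slot fill: after `srai a0, a0, 31` the sign word is stored to
16..28(sp), and the top element is shifted with `sra` instead of `srl`.
Funneling against a sign-filled upper half reproduces arithmetic-shift
semantics, as this rough C equivalent suggests (again an invented sketch, not
LLVM's code):

    #include <stdint.h>
    #include <string.h>

    static void ashr_i128_via_stack(const uint32_t src[4], unsigned amt,
                                    uint32_t dst[4]) {
      /* Portable sign replication; the asm gets this from srai by 31. */
      uint32_t sign = (src[3] & 0x80000000u) ? 0xffffffffu : 0u;
      uint32_t buf[8];
      memcpy(buf, src, 16);
      for (int i = 4; i < 8; ++i)
        buf[i] = sign;               /* sign-fill, not zero-fill, above  */
      const uint32_t *p = buf + ((amt >> 5) & 3);
      unsigned bits = amt & 31;
      for (int i = 0; i < 4; ++i) {
        uint32_t hi = bits ? p[i + 1] << (32 - bits) : 0;
        dst[i] = (p[i] >> bits) | hi; /* funneling against sign words
                                         yields arithmetic-shift results */
      }
    }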
 
-define void @lshr_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
-; RV64I-LABEL: lshr_32bytes:
-; RV64I:       # %bb.0:
-; RV64I-NEXT:    addi sp, sp, -224
-; RV64I-NEXT:    sd ra, 216(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    sd s0, 208(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    sd s1, 200(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    sd s2, 192(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    sd s3, 184(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    sd s4, 176(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    sd s5, 168(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    sd s6, 160(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    sd s7, 152(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    sd s8, 144(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    sd s9, 136(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    sd s10, 128(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    sd s11, 120(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    lbu a3, 0(a0)
-; RV64I-NEXT:    sd a3, 48(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    lbu a3, 1(a0)
-; RV64I-NEXT:    sd a3, 40(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    lbu a3, 2(a0)
-; RV64I-NEXT:    sd a3, 32(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    lbu a3, 3(a0)
-; RV64I-NEXT:    sd a3, 24(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    lbu a3, 4(a0)
-; RV64I-NEXT:    sd a3, 16(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    lbu a3, 5(a0)
-; RV64I-NEXT:    sd a3, 8(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    lbu t1, 6(a0)
-; RV64I-NEXT:    lbu t2, 7(a0)
-; RV64I-NEXT:    lbu t3, 8(a0)
-; RV64I-NEXT:    lbu t4, 9(a0)
-; RV64I-NEXT:    lbu t5, 10(a0)
-; RV64I-NEXT:    lbu t6, 11(a0)
-; RV64I-NEXT:    lbu s0, 12(a0)
-; RV64I-NEXT:    lbu s1, 13(a0)
-; RV64I-NEXT:    lbu s2, 14(a0)
-; RV64I-NEXT:    lbu s3, 15(a0)
-; RV64I-NEXT:    lbu s4, 16(a0)
-; RV64I-NEXT:    lbu s5, 17(a0)
-; RV64I-NEXT:    lbu s6, 18(a0)
-; RV64I-NEXT:    lbu s7, 19(a0)
-; RV64I-NEXT:    lbu s8, 20(a0)
-; RV64I-NEXT:    lbu s9, 21(a0)
-; RV64I-NEXT:    lbu s10, 22(a0)
-; RV64I-NEXT:    lbu s11, 23(a0)
-; RV64I-NEXT:    lbu ra, 24(a0)
-; RV64I-NEXT:    lbu t0, 25(a0)
-; RV64I-NEXT:    lbu a7, 26(a0)
-; RV64I-NEXT:    lbu a6, 27(a0)
-; RV64I-NEXT:    lbu a5, 28(a0)
-; RV64I-NEXT:    lbu a3, 31(a0)
-; RV64I-NEXT:    lbu a4, 30(a0)
-; RV64I-NEXT:    lbu a0, 29(a0)
-; RV64I-NEXT:    lbu a1, 0(a1)
-; RV64I-NEXT:    sb a3, 87(sp)
-; RV64I-NEXT:    sb a4, 86(sp)
-; RV64I-NEXT:    sb a0, 85(sp)
-; RV64I-NEXT:    sb a5, 84(sp)
-; RV64I-NEXT:    sb a6, 83(sp)
-; RV64I-NEXT:    sb a7, 82(sp)
-; RV64I-NEXT:    sb zero, 119(sp)
-; RV64I-NEXT:    sb zero, 118(sp)
-; RV64I-NEXT:    sb zero, 117(sp)
-; RV64I-NEXT:    sb zero, 116(sp)
-; RV64I-NEXT:    sb zero, 115(sp)
-; RV64I-NEXT:    sb zero, 114(sp)
-; RV64I-NEXT:    sb zero, 113(sp)
-; RV64I-NEXT:    sb zero, 112(sp)
-; RV64I-NEXT:    sb zero, 111(sp)
-; RV64I-NEXT:    sb zero, 110(sp)
-; RV64I-NEXT:    sb zero, 109(sp)
-; RV64I-NEXT:    sb zero, 108(sp)
-; RV64I-NEXT:    sb zero, 107(sp)
-; RV64I-NEXT:    sb zero, 106(sp)
-; RV64I-NEXT:    sb zero, 105(sp)
-; RV64I-NEXT:    sb zero, 104(sp)
-; RV64I-NEXT:    sb zero, 103(sp)
-; RV64I-NEXT:    sb zero, 102(sp)
-; RV64I-NEXT:    sb zero, 101(sp)
-; RV64I-NEXT:    sb zero, 100(sp)
-; RV64I-NEXT:    sb zero, 99(sp)
-; RV64I-NEXT:    sb zero, 98(sp)
-; RV64I-NEXT:    sb zero, 97(sp)
-; RV64I-NEXT:    sb zero, 96(sp)
-; RV64I-NEXT:    sb zero, 95(sp)
-; RV64I-NEXT:    sb zero, 94(sp)
-; RV64I-NEXT:    sb zero, 93(sp)
-; RV64I-NEXT:    sb zero, 92(sp)
-; RV64I-NEXT:    sb zero, 91(sp)
-; RV64I-NEXT:    sb zero, 90(sp)
-; RV64I-NEXT:    sb zero, 89(sp)
-; RV64I-NEXT:    sb zero, 88(sp)
-; RV64I-NEXT:    sb t0, 81(sp)
-; RV64I-NEXT:    sb ra, 80(sp)
-; RV64I-NEXT:    sb s11, 79(sp)
-; RV64I-NEXT:    sb s10, 78(sp)
-; RV64I-NEXT:    sb s9, 77(sp)
-; RV64I-NEXT:    sb s8, 76(sp)
-; RV64I-NEXT:    sb s7, 75(sp)
-; RV64I-NEXT:    sb s6, 74(sp)
-; RV64I-NEXT:    sb s5, 73(sp)
-; RV64I-NEXT:    sb s4, 72(sp)
-; RV64I-NEXT:    sb s3, 71(sp)
-; RV64I-NEXT:    sb s2, 70(sp)
-; RV64I-NEXT:    sb s1, 69(sp)
-; RV64I-NEXT:    sb s0, 68(sp)
-; RV64I-NEXT:    sb t6, 67(sp)
-; RV64I-NEXT:    sb t5, 66(sp)
-; RV64I-NEXT:    sb t4, 65(sp)
-; RV64I-NEXT:    sb t3, 64(sp)
-; RV64I-NEXT:    sb t2, 63(sp)
-; RV64I-NEXT:    sb t1, 62(sp)
-; RV64I-NEXT:    ld a0, 8(sp) # 8-byte Folded Reload
-; RV64I-NEXT:    sb a0, 61(sp)
-; RV64I-NEXT:    ld a0, 16(sp) # 8-byte Folded Reload
-; RV64I-NEXT:    sb a0, 60(sp)
-; RV64I-NEXT:    ld a0, 24(sp) # 8-byte Folded Reload
-; RV64I-NEXT:    sb a0, 59(sp)
-; RV64I-NEXT:    ld a0, 32(sp) # 8-byte Folded Reload
-; RV64I-NEXT:    sb a0, 58(sp)
-; RV64I-NEXT:    ld a0, 40(sp) # 8-byte Folded Reload
-; RV64I-NEXT:    sb a0, 57(sp)
-; RV64I-NEXT:    ld a0, 48(sp) # 8-byte Folded Reload
-; RV64I-NEXT:    sb a0, 56(sp)
-; RV64I-NEXT:    andi a1, a1, 31
-; RV64I-NEXT:    addi a0, sp, 56
-; RV64I-NEXT:    add a6, a0, a1
-; RV64I-NEXT:    lbu a0, 8(a6)
-; RV64I-NEXT:    sd a0, 48(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    lbu a0, 9(a6)
-; RV64I-NEXT:    sd a0, 40(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    lbu a0, 10(a6)
-; RV64I-NEXT:    sd a0, 32(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    lbu a0, 11(a6)
-; RV64I-NEXT:    sd a0, 24(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    lbu a0, 12(a6)
-; RV64I-NEXT:    sd a0, 16(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    lbu a7, 13(a6)
-; RV64I-NEXT:    lbu t0, 14(a6)
-; RV64I-NEXT:    lbu t1, 15(a6)
-; RV64I-NEXT:    lbu t2, 0(a6)
-; RV64I-NEXT:    lbu t3, 1(a6)
-; RV64I-NEXT:    lbu t4, 2(a6)
-; RV64I-NEXT:    lbu t5, 3(a6)
-; RV64I-NEXT:    lbu t6, 4(a6)
-; RV64I-NEXT:    lbu s0, 5(a6)
-; RV64I-NEXT:    lbu s1, 6(a6)
-; RV64I-NEXT:    lbu s2, 7(a6)
-; RV64I-NEXT:    lbu s3, 24(a6)
-; RV64I-NEXT:    lbu s4, 25(a6)
-; RV64I-NEXT:    lbu s5, 26(a6)
-; RV64I-NEXT:    lbu s6, 27(a6)
-; RV64I-NEXT:    lbu s7, 28(a6)
-; RV64I-NEXT:    lbu s8, 29(a6)
-; RV64I-NEXT:    lbu s9, 30(a6)
-; RV64I-NEXT:    lbu s10, 31(a6)
-; RV64I-NEXT:    lbu s11, 16(a6)
-; RV64I-NEXT:    lbu ra, 17(a6)
-; RV64I-NEXT:    lbu a5, 18(a6)
-; RV64I-NEXT:    lbu a4, 19(a6)
-; RV64I-NEXT:    lbu a0, 23(a6)
-; RV64I-NEXT:    lbu a1, 22(a6)
-; RV64I-NEXT:    lbu a3, 21(a6)
-; RV64I-NEXT:    lbu a6, 20(a6)
-; RV64I-NEXT:    sb a0, 23(a2)
-; RV64I-NEXT:    sb a1, 22(a2)
-; RV64I-NEXT:    sb a3, 21(a2)
-; RV64I-NEXT:    sb a6, 20(a2)
-; RV64I-NEXT:    sb a4, 19(a2)
-; RV64I-NEXT:    sb a5, 18(a2)
-; RV64I-NEXT:    sb ra, 17(a2)
-; RV64I-NEXT:    sb s11, 16(a2)
-; RV64I-NEXT:    sb s10, 31(a2)
-; RV64I-NEXT:    sb s9, 30(a2)
-; RV64I-NEXT:    sb s8, 29(a2)
-; RV64I-NEXT:    sb s7, 28(a2)
-; RV64I-NEXT:    sb s6, 27(a2)
-; RV64I-NEXT:    sb s5, 26(a2)
-; RV64I-NEXT:    sb s4, 25(a2)
-; RV64I-NEXT:    sb s3, 24(a2)
-; RV64I-NEXT:    sb s2, 7(a2)
-; RV64I-NEXT:    sb s1, 6(a2)
-; RV64I-NEXT:    sb s0, 5(a2)
-; RV64I-NEXT:    sb t6, 4(a2)
-; RV64I-NEXT:    sb t5, 3(a2)
-; RV64I-NEXT:    sb t4, 2(a2)
-; RV64I-NEXT:    sb t3, 1(a2)
-; RV64I-NEXT:    sb t2, 0(a2)
-; RV64I-NEXT:    sb t1, 15(a2)
-; RV64I-NEXT:    sb t0, 14(a2)
-; RV64I-NEXT:    sb a7, 13(a2)
-; RV64I-NEXT:    ld a0, 16(sp) # 8-byte Folded Reload
-; RV64I-NEXT:    sb a0, 12(a2)
-; RV64I-NEXT:    ld a0, 24(sp) # 8-byte Folded Reload
-; RV64I-NEXT:    sb a0, 11(a2)
-; RV64I-NEXT:    ld a0, 32(sp) # 8-byte Folded Reload
-; RV64I-NEXT:    sb a0, 10(a2)
-; RV64I-NEXT:    ld a0, 40(sp) # 8-byte Folded Reload
-; RV64I-NEXT:    sb a0, 9(a2)
-; RV64I-NEXT:    ld a0, 48(sp) # 8-byte Folded Reload
-; RV64I-NEXT:    sb a0, 8(a2)
-; RV64I-NEXT:    ld ra, 216(sp) # 8-byte Folded Reload
-; RV64I-NEXT:    ld s0, 208(sp) # 8-byte Folded Reload
-; RV64I-NEXT:    ld s1, 200(sp) # 8-byte Folded Reload
-; RV64I-NEXT:    ld s2, 192(sp) # 8-byte Folded Reload
-; RV64I-NEXT:    ld s3, 184(sp) # 8-byte Folded Reload
-; RV64I-NEXT:    ld s4, 176(sp) # 8-byte Folded Reload
-; RV64I-NEXT:    ld s5, 168(sp) # 8-byte Folded Reload
-; RV64I-NEXT:    ld s6, 160(sp) # 8-byte Folded Reload
-; RV64I-NEXT:    ld s7, 152(sp) # 8-byte Folded Reload
-; RV64I-NEXT:    ld s8, 144(sp) # 8-byte Folded Reload
-; RV64I-NEXT:    ld s9, 136(sp) # 8-byte Folded Reload
-; RV64I-NEXT:    ld s10, 128(sp) # 8-byte Folded Reload
-; RV64I-NEXT:    ld s11, 120(sp) # 8-byte Folded Reload
-; RV64I-NEXT:    addi sp, sp, 224
-; RV64I-NEXT:    ret
-;
-; RV32I-LABEL: lshr_32bytes:
-; RV32I:       # %bb.0:
-; RV32I-NEXT:    addi sp, sp, -144
-; RV32I-NEXT:    sw ra, 140(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s0, 136(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s1, 132(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s2, 128(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s3, 124(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s4, 120(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s5, 116(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s6, 112(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s7, 108(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s8, 104(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s9, 100(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s10, 96(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s11, 92(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    lbu a3, 0(a0)
-; RV32I-NEXT:    sw a3, 24(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    lbu a3, 1(a0)
-; RV32I-NEXT:    sw a3, 20(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    lbu a3, 2(a0)
-; RV32I-NEXT:    sw a3, 16(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    lbu a3, 3(a0)
-; RV32I-NEXT:    sw a3, 12(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    lbu a3, 4(a0)
-; RV32I-NEXT:    sw a3, 8(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    lbu a3, 5(a0)
-; RV32I-NEXT:    sw a3, 4(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    lbu t1, 6(a0)
-; RV32I-NEXT:    lbu t2, 7(a0)
-; RV32I-NEXT:    lbu t3, 8(a0)
-; RV32I-NEXT:    lbu t4, 9(a0)
-; RV32I-NEXT:    lbu t5, 10(a0)
-; RV32I-NEXT:    lbu t6, 11(a0)
-; RV32I-NEXT:    lbu s0, 12(a0)
-; RV32I-NEXT:    lbu s1, 13(a0)
-; RV32I-NEXT:    lbu s2, 14(a0)
-; RV32I-NEXT:    lbu s3, 15(a0)
-; RV32I-NEXT:    lbu s4, 16(a0)
-; RV32I-NEXT:    lbu s5, 17(a0)
-; RV32I-NEXT:    lbu s6, 18(a0)
-; RV32I-NEXT:    lbu s7, 19(a0)
-; RV32I-NEXT:    lbu s8, 20(a0)
-; RV32I-NEXT:    lbu s9, 21(a0)
-; RV32I-NEXT:    lbu s10, 22(a0)
-; RV32I-NEXT:    lbu s11, 23(a0)
-; RV32I-NEXT:    lbu ra, 24(a0)
-; RV32I-NEXT:    lbu t0, 25(a0)
-; RV32I-NEXT:    lbu a7, 26(a0)
-; RV32I-NEXT:    lbu a6, 27(a0)
-; RV32I-NEXT:    lbu a5, 28(a0)
-; RV32I-NEXT:    lbu a3, 31(a0)
-; RV32I-NEXT:    lbu a4, 30(a0)
-; RV32I-NEXT:    lbu a0, 29(a0)
-; RV32I-NEXT:    lbu a1, 0(a1)
-; RV32I-NEXT:    sb a3, 59(sp)
-; RV32I-NEXT:    sb a4, 58(sp)
-; RV32I-NEXT:    sb a0, 57(sp)
-; RV32I-NEXT:    sb a5, 56(sp)
-; RV32I-NEXT:    sb a6, 55(sp)
-; RV32I-NEXT:    sb a7, 54(sp)
-; RV32I-NEXT:    sb zero, 91(sp)
-; RV32I-NEXT:    sb zero, 90(sp)
-; RV32I-NEXT:    sb zero, 89(sp)
-; RV32I-NEXT:    sb zero, 88(sp)
-; RV32I-NEXT:    sb zero, 87(sp)
-; RV32I-NEXT:    sb zero, 86(sp)
-; RV32I-NEXT:    sb zero, 85(sp)
-; RV32I-NEXT:    sb zero, 84(sp)
-; RV32I-NEXT:    sb zero, 83(sp)
-; RV32I-NEXT:    sb zero, 82(sp)
-; RV32I-NEXT:    sb zero, 81(sp)
-; RV32I-NEXT:    sb zero, 80(sp)
-; RV32I-NEXT:    sb zero, 79(sp)
-; RV32I-NEXT:    sb zero, 78(sp)
-; RV32I-NEXT:    sb zero, 77(sp)
-; RV32I-NEXT:    sb zero, 76(sp)
-; RV32I-NEXT:    sb zero, 75(sp)
-; RV32I-NEXT:    sb zero, 74(sp)
-; RV32I-NEXT:    sb zero, 73(sp)
-; RV32I-NEXT:    sb zero, 72(sp)
-; RV32I-NEXT:    sb zero, 71(sp)
-; RV32I-NEXT:    sb zero, 70(sp)
-; RV32I-NEXT:    sb zero, 69(sp)
-; RV32I-NEXT:    sb zero, 68(sp)
-; RV32I-NEXT:    sb zero, 67(sp)
-; RV32I-NEXT:    sb zero, 66(sp)
-; RV32I-NEXT:    sb zero, 65(sp)
-; RV32I-NEXT:    sb zero, 64(sp)
-; RV32I-NEXT:    sb zero, 63(sp)
-; RV32I-NEXT:    sb zero, 62(sp)
-; RV32I-NEXT:    sb zero, 61(sp)
-; RV32I-NEXT:    sb zero, 60(sp)
-; RV32I-NEXT:    sb t0, 53(sp)
-; RV32I-NEXT:    sb ra, 52(sp)
-; RV32I-NEXT:    sb s11, 51(sp)
-; RV32I-NEXT:    sb s10, 50(sp)
-; RV32I-NEXT:    sb s9, 49(sp)
-; RV32I-NEXT:    sb s8, 48(sp)
-; RV32I-NEXT:    sb s7, 47(sp)
-; RV32I-NEXT:    sb s6, 46(sp)
-; RV32I-NEXT:    sb s5, 45(sp)
-; RV32I-NEXT:    sb s4, 44(sp)
-; RV32I-NEXT:    sb s3, 43(sp)
-; RV32I-NEXT:    sb s2, 42(sp)
-; RV32I-NEXT:    sb s1, 41(sp)
-; RV32I-NEXT:    sb s0, 40(sp)
-; RV32I-NEXT:    sb t6, 39(sp)
-; RV32I-NEXT:    sb t5, 38(sp)
-; RV32I-NEXT:    sb t4, 37(sp)
-; RV32I-NEXT:    sb t3, 36(sp)
-; RV32I-NEXT:    sb t2, 35(sp)
-; RV32I-NEXT:    sb t1, 34(sp)
-; RV32I-NEXT:    lw a0, 4(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    sb a0, 33(sp)
-; RV32I-NEXT:    lw a0, 8(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    sb a0, 32(sp)
-; RV32I-NEXT:    lw a0, 12(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    sb a0, 31(sp)
-; RV32I-NEXT:    lw a0, 16(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    sb a0, 30(sp)
-; RV32I-NEXT:    lw a0, 20(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    sb a0, 29(sp)
-; RV32I-NEXT:    lw a0, 24(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    sb a0, 28(sp)
-; RV32I-NEXT:    andi a1, a1, 31
-; RV32I-NEXT:    addi a0, sp, 28
-; RV32I-NEXT:    add a6, a0, a1
-; RV32I-NEXT:    lbu a0, 6(a6)
-; RV32I-NEXT:    sw a0, 24(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    lbu a0, 7(a6)
-; RV32I-NEXT:    sw a0, 20(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    lbu a0, 4(a6)
-; RV32I-NEXT:    sw a0, 16(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    lbu a0, 5(a6)
-; RV32I-NEXT:    sw a0, 12(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    lbu a0, 0(a6)
-; RV32I-NEXT:    sw a0, 8(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    lbu a7, 1(a6)
-; RV32I-NEXT:    lbu t0, 2(a6)
-; RV32I-NEXT:    lbu t1, 3(a6)
-; RV32I-NEXT:    lbu t2, 14(a6)
-; RV32I-NEXT:    lbu t3, 15(a6)
-; RV32I-NEXT:    lbu t4, 12(a6)
-; RV32I-NEXT:    lbu t5, 13(a6)
-; RV32I-NEXT:    lbu t6, 10(a6)
-; RV32I-NEXT:    lbu s0, 11(a6)
-; RV32I-NEXT:    lbu s1, 8(a6)
-; RV32I-NEXT:    lbu s2, 9(a6)
-; RV32I-NEXT:    lbu s3, 22(a6)
-; RV32I-NEXT:    lbu s4, 23(a6)
-; RV32I-NEXT:    lbu s5, 20(a6)
-; RV32I-NEXT:    lbu s6, 21(a6)
-; RV32I-NEXT:    lbu s7, 18(a6)
-; RV32I-NEXT:    lbu s8, 19(a6)
-; RV32I-NEXT:    lbu s9, 16(a6)
-; RV32I-NEXT:    lbu s10, 17(a6)
-; RV32I-NEXT:    lbu s11, 30(a6)
-; RV32I-NEXT:    lbu ra, 31(a6)
-; RV32I-NEXT:    lbu a5, 28(a6)
-; RV32I-NEXT:    lbu a4, 29(a6)
-; RV32I-NEXT:    lbu a0, 25(a6)
-; RV32I-NEXT:    lbu a1, 24(a6)
-; RV32I-NEXT:    lbu a3, 27(a6)
-; RV32I-NEXT:    lbu a6, 26(a6)
-; RV32I-NEXT:    sb a0, 25(a2)
-; RV32I-NEXT:    sb a1, 24(a2)
-; RV32I-NEXT:    sb a3, 27(a2)
-; RV32I-NEXT:    sb a6, 26(a2)
-; RV32I-NEXT:    sb a4, 29(a2)
-; RV32I-NEXT:    sb a5, 28(a2)
-; RV32I-NEXT:    sb ra, 31(a2)
-; RV32I-NEXT:    sb s11, 30(a2)
-; RV32I-NEXT:    sb s10, 17(a2)
-; RV32I-NEXT:    sb s9, 16(a2)
-; RV32I-NEXT:    sb s8, 19(a2)
-; RV32I-NEXT:    sb s7, 18(a2)
-; RV32I-NEXT:    sb s6, 21(a2)
-; RV32I-NEXT:    sb s5, 20(a2)
-; RV32I-NEXT:    sb s4, 23(a2)
-; RV32I-NEXT:    sb s3, 22(a2)
-; RV32I-NEXT:    sb s2, 9(a2)
-; RV32I-NEXT:    sb s1, 8(a2)
-; RV32I-NEXT:    sb s0, 11(a2)
-; RV32I-NEXT:    sb t6, 10(a2)
-; RV32I-NEXT:    sb t5, 13(a2)
-; RV32I-NEXT:    sb t4, 12(a2)
-; RV32I-NEXT:    sb t3, 15(a2)
-; RV32I-NEXT:    sb t2, 14(a2)
-; RV32I-NEXT:    sb t1, 3(a2)
-; RV32I-NEXT:    sb t0, 2(a2)
-; RV32I-NEXT:    sb a7, 1(a2)
-; RV32I-NEXT:    lw a0, 8(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    sb a0, 0(a2)
-; RV32I-NEXT:    lw a0, 12(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    sb a0, 5(a2)
-; RV32I-NEXT:    lw a0, 16(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    sb a0, 4(a2)
-; RV32I-NEXT:    lw a0, 20(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    sb a0, 7(a2)
-; RV32I-NEXT:    lw a0, 24(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    sb a0, 6(a2)
-; RV32I-NEXT:    lw ra, 140(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s0, 136(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s1, 132(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s2, 128(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s3, 124(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s4, 120(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s5, 116(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s6, 112(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s7, 108(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s8, 104(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s9, 100(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s10, 96(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s11, 92(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    addi sp, sp, 144
-; RV32I-NEXT:    ret
-  %src = load i256, ptr %src.ptr, align 1
-  %byteOff = load i256, ptr %byteOff.ptr, align 1
-  %bitOff = shl i256 %byteOff, 3
-  %res = lshr i256 %src, %bitOff
-  store i256 %res, ptr %dst, align 1
-  ret void
-}
-define void @shl_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
-; RV64I-LABEL: shl_32bytes:
+define void @ashr_16bytes_wordOff(ptr %src.ptr, ptr %wordOff.ptr, ptr %dst) nounwind {
+; RV64I-LABEL: ashr_16bytes_wordOff:
 ; RV64I:       # %bb.0:
-; RV64I-NEXT:    addi sp, sp, -224
-; RV64I-NEXT:    sd ra, 216(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    sd s0, 208(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    sd s1, 200(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    sd s2, 192(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    sd s3, 184(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    sd s4, 176(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    sd s5, 168(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    sd s6, 160(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    sd s7, 152(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    sd s8, 144(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    sd s9, 136(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    sd s10, 128(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    sd s11, 120(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    lbu a3, 0(a0)
-; RV64I-NEXT:    sd a3, 48(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    lbu a3, 1(a0)
-; RV64I-NEXT:    sd a3, 40(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    lbu a3, 2(a0)
-; RV64I-NEXT:    sd a3, 32(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    lbu a3, 3(a0)
-; RV64I-NEXT:    sd a3, 24(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    lbu a3, 4(a0)
-; RV64I-NEXT:    sd a3, 16(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    lbu a3, 5(a0)
-; RV64I-NEXT:    sd a3, 8(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    lbu t1, 6(a0)
-; RV64I-NEXT:    lbu t2, 7(a0)
-; RV64I-NEXT:    lbu t3, 8(a0)
-; RV64I-NEXT:    lbu t4, 9(a0)
-; RV64I-NEXT:    lbu t5, 10(a0)
-; RV64I-NEXT:    lbu t6, 11(a0)
-; RV64I-NEXT:    lbu s0, 12(a0)
-; RV64I-NEXT:    lbu s1, 13(a0)
-; RV64I-NEXT:    lbu s2, 14(a0)
-; RV64I-NEXT:    lbu s3, 15(a0)
-; RV64I-NEXT:    lbu s4, 16(a0)
-; RV64I-NEXT:    lbu s5, 17(a0)
-; RV64I-NEXT:    lbu s6, 18(a0)
-; RV64I-NEXT:    lbu s7, 19(a0)
-; RV64I-NEXT:    lbu s8, 20(a0)
-; RV64I-NEXT:    lbu s9, 21(a0)
-; RV64I-NEXT:    lbu s10, 22(a0)
-; RV64I-NEXT:    lbu s11, 23(a0)
-; RV64I-NEXT:    lbu ra, 24(a0)
-; RV64I-NEXT:    lbu t0, 25(a0)
-; RV64I-NEXT:    lbu a7, 26(a0)
-; RV64I-NEXT:    lbu a6, 27(a0)
-; RV64I-NEXT:    lbu a5, 28(a0)
-; RV64I-NEXT:    lbu a3, 31(a0)
-; RV64I-NEXT:    lbu a4, 30(a0)
-; RV64I-NEXT:    lbu a0, 29(a0)
-; RV64I-NEXT:    lbu a1, 0(a1)
-; RV64I-NEXT:    sb a3, 119(sp)
-; RV64I-NEXT:    sb a4, 118(sp)
-; RV64I-NEXT:    sb a0, 117(sp)
-; RV64I-NEXT:    sb a5, 116(sp)
-; RV64I-NEXT:    sb a6, 115(sp)
-; RV64I-NEXT:    sb a7, 114(sp)
-; RV64I-NEXT:    sb zero, 87(sp)
-; RV64I-NEXT:    sb zero, 86(sp)
-; RV64I-NEXT:    sb zero, 85(sp)
-; RV64I-NEXT:    sb zero, 84(sp)
-; RV64I-NEXT:    sb zero, 83(sp)
-; RV64I-NEXT:    sb zero, 82(sp)
-; RV64I-NEXT:    sb zero, 81(sp)
-; RV64I-NEXT:    sb zero, 80(sp)
-; RV64I-NEXT:    sb zero, 79(sp)
-; RV64I-NEXT:    sb zero, 78(sp)
-; RV64I-NEXT:    sb zero, 77(sp)
-; RV64I-NEXT:    sb zero, 76(sp)
-; RV64I-NEXT:    sb zero, 75(sp)
-; RV64I-NEXT:    sb zero, 74(sp)
-; RV64I-NEXT:    sb zero, 73(sp)
-; RV64I-NEXT:    sb zero, 72(sp)
-; RV64I-NEXT:    sb zero, 71(sp)
-; RV64I-NEXT:    sb zero, 70(sp)
-; RV64I-NEXT:    sb zero, 69(sp)
-; RV64I-NEXT:    sb zero, 68(sp)
-; RV64I-NEXT:    sb zero, 67(sp)
-; RV64I-NEXT:    sb zero, 66(sp)
-; RV64I-NEXT:    sb zero, 65(sp)
-; RV64I-NEXT:    sb zero, 64(sp)
-; RV64I-NEXT:    sb zero, 63(sp)
-; RV64I-NEXT:    sb zero, 62(sp)
-; RV64I-NEXT:    sb zero, 61(sp)
-; RV64I-NEXT:    sb zero, 60(sp)
-; RV64I-NEXT:    sb zero, 59(sp)
-; RV64I-NEXT:    sb zero, 58(sp)
-; RV64I-NEXT:    sb zero, 57(sp)
-; RV64I-NEXT:    sb zero, 56(sp)
-; RV64I-NEXT:    sb t0, 113(sp)
-; RV64I-NEXT:    sb ra, 112(sp)
-; RV64I-NEXT:    sb s11, 111(sp)
-; RV64I-NEXT:    sb s10, 110(sp)
-; RV64I-NEXT:    sb s9, 109(sp)
-; RV64I-NEXT:    sb s8, 108(sp)
-; RV64I-NEXT:    sb s7, 107(sp)
-; RV64I-NEXT:    sb s6, 106(sp)
-; RV64I-NEXT:    sb s5, 105(sp)
-; RV64I-NEXT:    sb s4, 104(sp)
-; RV64I-NEXT:    sb s3, 103(sp)
-; RV64I-NEXT:    sb s2, 102(sp)
-; RV64I-NEXT:    sb s1, 101(sp)
-; RV64I-NEXT:    sb s0, 100(sp)
-; RV64I-NEXT:    sb t6, 99(sp)
-; RV64I-NEXT:    sb t5, 98(sp)
-; RV64I-NEXT:    sb t4, 97(sp)
-; RV64I-NEXT:    sb t3, 96(sp)
-; RV64I-NEXT:    sb t2, 95(sp)
-; RV64I-NEXT:    sb t1, 94(sp)
-; RV64I-NEXT:    ld a0, 8(sp) # 8-byte Folded Reload
-; RV64I-NEXT:    sb a0, 93(sp)
-; RV64I-NEXT:    ld a0, 16(sp) # 8-byte Folded Reload
-; RV64I-NEXT:    sb a0, 92(sp)
-; RV64I-NEXT:    ld a0, 24(sp) # 8-byte Folded Reload
-; RV64I-NEXT:    sb a0, 91(sp)
-; RV64I-NEXT:    ld a0, 32(sp) # 8-byte Folded Reload
-; RV64I-NEXT:    sb a0, 90(sp)
-; RV64I-NEXT:    ld a0, 40(sp) # 8-byte Folded Reload
-; RV64I-NEXT:    sb a0, 89(sp)
-; RV64I-NEXT:    ld a0, 48(sp) # 8-byte Folded Reload
-; RV64I-NEXT:    sb a0, 88(sp)
-; RV64I-NEXT:    andi a1, a1, 31
-; RV64I-NEXT:    addi a0, sp, 88
-; RV64I-NEXT:    sub a6, a0, a1
-; RV64I-NEXT:    lbu a0, 8(a6)
-; RV64I-NEXT:    sd a0, 48(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    lbu a0, 9(a6)
-; RV64I-NEXT:    sd a0, 40(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    lbu a0, 10(a6)
-; RV64I-NEXT:    sd a0, 32(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    lbu a0, 11(a6)
-; RV64I-NEXT:    sd a0, 24(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    lbu a0, 12(a6)
-; RV64I-NEXT:    sd a0, 16(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    lbu a7, 13(a6)
-; RV64I-NEXT:    lbu t0, 14(a6)
-; RV64I-NEXT:    lbu t1, 15(a6)
-; RV64I-NEXT:    lbu t2, 0(a6)
-; RV64I-NEXT:    lbu t3, 1(a6)
-; RV64I-NEXT:    lbu t4, 2(a6)
-; RV64I-NEXT:    lbu t5, 3(a6)
-; RV64I-NEXT:    lbu t6, 4(a6)
-; RV64I-NEXT:    lbu s0, 5(a6)
-; RV64I-NEXT:    lbu s1, 6(a6)
-; RV64I-NEXT:    lbu s2, 7(a6)
-; RV64I-NEXT:    lbu s3, 24(a6)
-; RV64I-NEXT:    lbu s4, 25(a6)
-; RV64I-NEXT:    lbu s5, 26(a6)
-; RV64I-NEXT:    lbu s6, 27(a6)
-; RV64I-NEXT:    lbu s7, 28(a6)
-; RV64I-NEXT:    lbu s8, 29(a6)
-; RV64I-NEXT:    lbu s9, 30(a6)
-; RV64I-NEXT:    lbu s10, 31(a6)
-; RV64I-NEXT:    lbu s11, 16(a6)
-; RV64I-NEXT:    lbu ra, 17(a6)
-; RV64I-NEXT:    lbu a5, 18(a6)
-; RV64I-NEXT:    lbu a4, 19(a6)
-; RV64I-NEXT:    lbu a0, 23(a6)
-; RV64I-NEXT:    lbu a1, 22(a6)
-; RV64I-NEXT:    lbu a3, 21(a6)
-; RV64I-NEXT:    lbu a6, 20(a6)
-; RV64I-NEXT:    sb a0, 23(a2)
-; RV64I-NEXT:    sb a1, 22(a2)
-; RV64I-NEXT:    sb a3, 21(a2)
-; RV64I-NEXT:    sb a6, 20(a2)
-; RV64I-NEXT:    sb a4, 19(a2)
-; RV64I-NEXT:    sb a5, 18(a2)
-; RV64I-NEXT:    sb ra, 17(a2)
-; RV64I-NEXT:    sb s11, 16(a2)
-; RV64I-NEXT:    sb s10, 31(a2)
-; RV64I-NEXT:    sb s9, 30(a2)
-; RV64I-NEXT:    sb s8, 29(a2)
-; RV64I-NEXT:    sb s7, 28(a2)
-; RV64I-NEXT:    sb s6, 27(a2)
-; RV64I-NEXT:    sb s5, 26(a2)
-; RV64I-NEXT:    sb s4, 25(a2)
-; RV64I-NEXT:    sb s3, 24(a2)
-; RV64I-NEXT:    sb s2, 7(a2)
-; RV64I-NEXT:    sb s1, 6(a2)
-; RV64I-NEXT:    sb s0, 5(a2)
-; RV64I-NEXT:    sb t6, 4(a2)
-; RV64I-NEXT:    sb t5, 3(a2)
-; RV64I-NEXT:    sb t4, 2(a2)
-; RV64I-NEXT:    sb t3, 1(a2)
-; RV64I-NEXT:    sb t2, 0(a2)
-; RV64I-NEXT:    sb t1, 15(a2)
-; RV64I-NEXT:    sb t0, 14(a2)
-; RV64I-NEXT:    sb a7, 13(a2)
-; RV64I-NEXT:    ld a0, 16(sp) # 8-byte Folded Reload
+; RV64I-NEXT:    lbu a3, 9(a0)
+; RV64I-NEXT:    lbu a4, 8(a0)
+; RV64I-NEXT:    lbu a5, 10(a0)
+; RV64I-NEXT:    lbu a6, 11(a0)
+; RV64I-NEXT:    slli a3, a3, 8
+; RV64I-NEXT:    or a3, a3, a4
+; RV64I-NEXT:    slli a5, a5, 16
+; RV64I-NEXT:    slli a6, a6, 24
+; RV64I-NEXT:    or a4, a6, a5
+; RV64I-NEXT:    or a3, a4, a3
+; RV64I-NEXT:    lbu a4, 13(a0)
+; RV64I-NEXT:    lbu a5, 12(a0)
+; RV64I-NEXT:    lbu a6, 14(a0)
+; RV64I-NEXT:    lbu a7, 15(a0)
+; RV64I-NEXT:    slli a4, a4, 8
+; RV64I-NEXT:    or a4, a4, a5
+; RV64I-NEXT:    slli a6, a6, 16
+; RV64I-NEXT:    slli a7, a7, 24
+; RV64I-NEXT:    or a5, a7, a6
+; RV64I-NEXT:    or a4, a5, a4
+; RV64I-NEXT:    slli a5, a4, 32
+; RV64I-NEXT:    or a3, a5, a3
+; RV64I-NEXT:    lbu a5, 5(a1)
+; RV64I-NEXT:    lbu a6, 4(a1)
+; RV64I-NEXT:    lbu a7, 6(a1)
+; RV64I-NEXT:    lbu t0, 7(a1)
+; RV64I-NEXT:    slli a5, a5, 8
+; RV64I-NEXT:    or a5, a5, a6
+; RV64I-NEXT:    slli a7, a7, 16
+; RV64I-NEXT:    slli t0, t0, 24
+; RV64I-NEXT:    or a6, t0, a7
+; RV64I-NEXT:    or a5, a6, a5
+; RV64I-NEXT:    lbu a6, 1(a1)
+; RV64I-NEXT:    lbu a7, 0(a1)
+; RV64I-NEXT:    lbu t0, 2(a1)
+; RV64I-NEXT:    lbu a1, 3(a1)
+; RV64I-NEXT:    slli a6, a6, 8
+; RV64I-NEXT:    or a6, a6, a7
+; RV64I-NEXT:    slli t0, t0, 16
+; RV64I-NEXT:    slli a1, a1, 24
+; RV64I-NEXT:    or a1, a1, t0
+; RV64I-NEXT:    or a1, a1, a6
+; RV64I-NEXT:    slli a1, a1, 5
+; RV64I-NEXT:    slli a5, a5, 37
+; RV64I-NEXT:    or a5, a5, a1
+; RV64I-NEXT:    addi a6, a5, -64
+; RV64I-NEXT:    sra a1, a3, a5
+; RV64I-NEXT:    bltz a6, .LBB11_2
+; RV64I-NEXT:  # %bb.1:
+; RV64I-NEXT:    sraiw a3, a4, 31
+; RV64I-NEXT:    mv a0, a1
+; RV64I-NEXT:    mv a1, a3
+; RV64I-NEXT:    j .LBB11_3
+; RV64I-NEXT:  .LBB11_2:
+; RV64I-NEXT:    lbu a4, 1(a0)
+; RV64I-NEXT:    lbu a6, 0(a0)
+; RV64I-NEXT:    lbu a7, 2(a0)
+; RV64I-NEXT:    lbu t0, 3(a0)
+; RV64I-NEXT:    slli a4, a4, 8
+; RV64I-NEXT:    or a4, a4, a6
+; RV64I-NEXT:    slli a7, a7, 16
+; RV64I-NEXT:    slli t0, t0, 24
+; RV64I-NEXT:    or a6, t0, a7
+; RV64I-NEXT:    or a4, a6, a4
+; RV64I-NEXT:    lbu a6, 5(a0)
+; RV64I-NEXT:    lbu a7, 4(a0)
+; RV64I-NEXT:    lbu t0, 6(a0)
+; RV64I-NEXT:    lbu a0, 7(a0)
+; RV64I-NEXT:    slli a6, a6, 8
+; RV64I-NEXT:    or a6, a6, a7
+; RV64I-NEXT:    slli t0, t0, 16
+; RV64I-NEXT:    slli a0, a0, 24
+; RV64I-NEXT:    or a0, a0, t0
+; RV64I-NEXT:    or a0, a0, a6
+; RV64I-NEXT:    slli a0, a0, 32
+; RV64I-NEXT:    or a0, a0, a4
+; RV64I-NEXT:    srl a0, a0, a5
+; RV64I-NEXT:    not a4, a5
+; RV64I-NEXT:    slli a3, a3, 1
+; RV64I-NEXT:    sll a3, a3, a4
+; RV64I-NEXT:    or a0, a0, a3
+; RV64I-NEXT:  .LBB11_3:
+; RV64I-NEXT:    sb a1, 8(a2)
+; RV64I-NEXT:    srli a3, a1, 56
+; RV64I-NEXT:    sb a3, 15(a2)
+; RV64I-NEXT:    srli a3, a1, 48
+; RV64I-NEXT:    sb a3, 14(a2)
+; RV64I-NEXT:    srli a3, a1, 40
+; RV64I-NEXT:    sb a3, 13(a2)
+; RV64I-NEXT:    srli a3, a1, 32
+; RV64I-NEXT:    sb a3, 12(a2)
+; RV64I-NEXT:    srli a3, a1, 24
+; RV64I-NEXT:    sb a3, 11(a2)
+; RV64I-NEXT:    srli a3, a1, 16
+; RV64I-NEXT:    sb a3, 10(a2)
+; RV64I-NEXT:    srli a1, a1, 8
+; RV64I-NEXT:    sb a1, 9(a2)
+; RV64I-NEXT:    sb a0, 0(a2)
+; RV64I-NEXT:    srli a1, a0, 56
+; RV64I-NEXT:    sb a1, 7(a2)
+; RV64I-NEXT:    srli a1, a0, 48
+; RV64I-NEXT:    sb a1, 6(a2)
+; RV64I-NEXT:    srli a1, a0, 40
+; RV64I-NEXT:    sb a1, 5(a2)
+; RV64I-NEXT:    srli a1, a0, 32
+; RV64I-NEXT:    sb a1, 4(a2)
+; RV64I-NEXT:    srli a1, a0, 24
+; RV64I-NEXT:    sb a1, 3(a2)
+; RV64I-NEXT:    srli a1, a0, 16
+; RV64I-NEXT:    sb a1, 2(a2)
+; RV64I-NEXT:    srli a0, a0, 8
+; RV64I-NEXT:    sb a0, 1(a2)
+; RV64I-NEXT:    ret
+;
+; RV32I-LABEL: ashr_16bytes_wordOff:
+; RV32I:       # %bb.0:
+; RV32I-NEXT:    addi sp, sp, -32
+; RV32I-NEXT:    lbu a3, 1(a0)
+; RV32I-NEXT:    lbu a4, 0(a0)
+; RV32I-NEXT:    lbu a5, 2(a0)
+; RV32I-NEXT:    lbu a6, 3(a0)
+; RV32I-NEXT:    slli a3, a3, 8
+; RV32I-NEXT:    or a3, a3, a4
+; RV32I-NEXT:    slli a5, a5, 16
+; RV32I-NEXT:    slli a6, a6, 24
+; RV32I-NEXT:    or a4, a6, a5
+; RV32I-NEXT:    or a3, a4, a3
+; RV32I-NEXT:    lbu a4, 5(a0)
+; RV32I-NEXT:    lbu a5, 4(a0)
+; RV32I-NEXT:    lbu a6, 6(a0)
+; RV32I-NEXT:    lbu a7, 7(a0)
+; RV32I-NEXT:    slli a4, a4, 8
+; RV32I-NEXT:    or a4, a4, a5
+; RV32I-NEXT:    slli a6, a6, 16
+; RV32I-NEXT:    slli a7, a7, 24
+; RV32I-NEXT:    or a5, a7, a6
+; RV32I-NEXT:    or a4, a5, a4
+; RV32I-NEXT:    lbu a5, 9(a0)
+; RV32I-NEXT:    lbu a6, 8(a0)
+; RV32I-NEXT:    lbu a7, 10(a0)
+; RV32I-NEXT:    lbu t0, 11(a0)
+; RV32I-NEXT:    slli a5, a5, 8
+; RV32I-NEXT:    or a5, a5, a6
+; RV32I-NEXT:    slli a7, a7, 16
+; RV32I-NEXT:    slli t0, t0, 24
+; RV32I-NEXT:    or a6, t0, a7
+; RV32I-NEXT:    or a5, a6, a5
+; RV32I-NEXT:    lbu a6, 13(a0)
+; RV32I-NEXT:    lbu a7, 12(a0)
+; RV32I-NEXT:    lbu t0, 14(a0)
+; RV32I-NEXT:    lbu a0, 15(a0)
+; RV32I-NEXT:    slli a6, a6, 8
+; RV32I-NEXT:    or a6, a6, a7
+; RV32I-NEXT:    slli t0, t0, 16
+; RV32I-NEXT:    slli a0, a0, 24
+; RV32I-NEXT:    or a7, a0, t0
+; RV32I-NEXT:    or a6, a7, a6
+; RV32I-NEXT:    lbu a1, 0(a1)
+; RV32I-NEXT:    srai a0, a0, 31
+; RV32I-NEXT:    sw a0, 28(sp)
+; RV32I-NEXT:    sw a0, 24(sp)
+; RV32I-NEXT:    sw a0, 20(sp)
+; RV32I-NEXT:    sw a0, 16(sp)
+; RV32I-NEXT:    sw a6, 12(sp)
+; RV32I-NEXT:    sw a5, 8(sp)
+; RV32I-NEXT:    sw a4, 4(sp)
+; RV32I-NEXT:    sw a3, 0(sp)
+; RV32I-NEXT:    slli a1, a1, 2
+; RV32I-NEXT:    andi a1, a1, 12
+; RV32I-NEXT:    mv a0, sp
+; RV32I-NEXT:    add a0, a0, a1
+; RV32I-NEXT:    lw a1, 8(a0)
+; RV32I-NEXT:    lw a3, 12(a0)
+; RV32I-NEXT:    lw a4, 0(a0)
+; RV32I-NEXT:    lw a0, 4(a0)
+; RV32I-NEXT:    sb a1, 8(a2)
+; RV32I-NEXT:    sb a3, 12(a2)
+; RV32I-NEXT:    sb a4, 0(a2)
+; RV32I-NEXT:    sb a0, 4(a2)
+; RV32I-NEXT:    srli a5, a1, 16
+; RV32I-NEXT:    sb a5, 10(a2)
+; RV32I-NEXT:    srli a5, a1, 24
+; RV32I-NEXT:    sb a5, 11(a2)
+; RV32I-NEXT:    srli a1, a1, 8
+; RV32I-NEXT:    sb a1, 9(a2)
+; RV32I-NEXT:    srli a1, a3, 16
+; RV32I-NEXT:    sb a1, 14(a2)
+; RV32I-NEXT:    srli a1, a3, 24
+; RV32I-NEXT:    sb a1, 15(a2)
+; RV32I-NEXT:    srli a3, a3, 8
+; RV32I-NEXT:    sb a3, 13(a2)
+; RV32I-NEXT:    srli a1, a4, 16
+; RV32I-NEXT:    sb a1, 2(a2)
+; RV32I-NEXT:    srli a1, a4, 24
+; RV32I-NEXT:    sb a1, 3(a2)
+; RV32I-NEXT:    srli a4, a4, 8
+; RV32I-NEXT:    sb a4, 1(a2)
+; RV32I-NEXT:    srli a1, a0, 16
+; RV32I-NEXT:    sb a1, 6(a2)
+; RV32I-NEXT:    srli a1, a0, 24
+; RV32I-NEXT:    sb a1, 7(a2)
+; RV32I-NEXT:    srli a0, a0, 8
+; RV32I-NEXT:    sb a0, 5(a2)
+; RV32I-NEXT:    addi sp, sp, 32
+; RV32I-NEXT:    ret
+  %src = load i128, ptr %src.ptr, align 1
+  %wordOff = load i128, ptr %wordOff.ptr, align 1
+  %bitOff = shl i128 %wordOff, 5
+  %res = ashr i128 %src, %bitOff
+  store i128 %res, ptr %dst, align 1
+  ret void
+}
+
+define void @lshr_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
+; RV64I-LABEL: lshr_32bytes:
+; RV64I:       # %bb.0:
+; RV64I-NEXT:    addi sp, sp, -64
+; RV64I-NEXT:    lbu a3, 1(a0)
+; RV64I-NEXT:    lbu a4, 0(a0)
+; RV64I-NEXT:    lbu a5, 2(a0)
+; RV64I-NEXT:    lbu a6, 3(a0)
+; RV64I-NEXT:    slli a3, a3, 8
+; RV64I-NEXT:    or a3, a3, a4
+; RV64I-NEXT:    slli a5, a5, 16
+; RV64I-NEXT:    slli a6, a6, 24
+; RV64I-NEXT:    or a4, a6, a5
+; RV64I-NEXT:    or a3, a4, a3
+; RV64I-NEXT:    lbu a4, 5(a0)
+; RV64I-NEXT:    lbu a5, 4(a0)
+; RV64I-NEXT:    lbu a6, 6(a0)
+; RV64I-NEXT:    lbu a7, 7(a0)
+; RV64I-NEXT:    slli a4, a4, 8
+; RV64I-NEXT:    or a4, a4, a5
+; RV64I-NEXT:    slli a6, a6, 16
+; RV64I-NEXT:    slli a7, a7, 24
+; RV64I-NEXT:    or a5, a7, a6
+; RV64I-NEXT:    or a4, a5, a4
+; RV64I-NEXT:    slli a4, a4, 32
+; RV64I-NEXT:    or a3, a4, a3
+; RV64I-NEXT:    lbu a4, 9(a0)
+; RV64I-NEXT:    lbu a5, 8(a0)
+; RV64I-NEXT:    lbu a6, 10(a0)
+; RV64I-NEXT:    lbu a7, 11(a0)
+; RV64I-NEXT:    slli a4, a4, 8
+; RV64I-NEXT:    or a4, a4, a5
+; RV64I-NEXT:    slli a6, a6, 16
+; RV64I-NEXT:    slli a7, a7, 24
+; RV64I-NEXT:    or a5, a7, a6
+; RV64I-NEXT:    or a4, a5, a4
+; RV64I-NEXT:    lbu a5, 13(a0)
+; RV64I-NEXT:    lbu a6, 12(a0)
+; RV64I-NEXT:    lbu a7, 14(a0)
+; RV64I-NEXT:    lbu t0, 15(a0)
+; RV64I-NEXT:    slli a5, a5, 8
+; RV64I-NEXT:    or a5, a5, a6
+; RV64I-NEXT:    slli a7, a7, 16
+; RV64I-NEXT:    slli t0, t0, 24
+; RV64I-NEXT:    or a6, t0, a7
+; RV64I-NEXT:    or a5, a6, a5
+; RV64I-NEXT:    slli a5, a5, 32
+; RV64I-NEXT:    or a4, a5, a4
+; RV64I-NEXT:    lbu a5, 17(a0)
+; RV64I-NEXT:    lbu a6, 16(a0)
+; RV64I-NEXT:    lbu a7, 18(a0)
+; RV64I-NEXT:    lbu t0, 19(a0)
+; RV64I-NEXT:    slli a5, a5, 8
+; RV64I-NEXT:    or a5, a5, a6
+; RV64I-NEXT:    slli a7, a7, 16
+; RV64I-NEXT:    slli t0, t0, 24
+; RV64I-NEXT:    or a6, t0, a7
+; RV64I-NEXT:    or a5, a6, a5
+; RV64I-NEXT:    lbu a6, 21(a0)
+; RV64I-NEXT:    lbu a7, 20(a0)
+; RV64I-NEXT:    lbu t0, 22(a0)
+; RV64I-NEXT:    lbu t1, 23(a0)
+; RV64I-NEXT:    slli a6, a6, 8
+; RV64I-NEXT:    or a6, a6, a7
+; RV64I-NEXT:    slli t0, t0, 16
+; RV64I-NEXT:    slli t1, t1, 24
+; RV64I-NEXT:    or a7, t1, t0
+; RV64I-NEXT:    or a6, a7, a6
+; RV64I-NEXT:    slli a6, a6, 32
+; RV64I-NEXT:    or a5, a6, a5
+; RV64I-NEXT:    lbu a6, 25(a0)
+; RV64I-NEXT:    lbu a7, 24(a0)
+; RV64I-NEXT:    lbu t0, 26(a0)
+; RV64I-NEXT:    lbu t1, 27(a0)
+; RV64I-NEXT:    slli a6, a6, 8
+; RV64I-NEXT:    or a6, a6, a7
+; RV64I-NEXT:    slli t0, t0, 16
+; RV64I-NEXT:    slli t1, t1, 24
+; RV64I-NEXT:    or a7, t1, t0
+; RV64I-NEXT:    or a6, a7, a6
+; RV64I-NEXT:    lbu a7, 29(a0)
+; RV64I-NEXT:    lbu t0, 28(a0)
+; RV64I-NEXT:    lbu t1, 30(a0)
+; RV64I-NEXT:    lbu a0, 31(a0)
+; RV64I-NEXT:    slli a7, a7, 8
+; RV64I-NEXT:    or a7, a7, t0
+; RV64I-NEXT:    slli t1, t1, 16
+; RV64I-NEXT:    slli a0, a0, 24
+; RV64I-NEXT:    or a0, a0, t1
+; RV64I-NEXT:    or a0, a0, a7
+; RV64I-NEXT:    slli a0, a0, 32
+; RV64I-NEXT:    or a0, a0, a6
+; RV64I-NEXT:    lbu a6, 1(a1)
+; RV64I-NEXT:    lbu a7, 0(a1)
+; RV64I-NEXT:    lbu t0, 2(a1)
+; RV64I-NEXT:    lbu t1, 3(a1)
+; RV64I-NEXT:    slli a6, a6, 8
+; RV64I-NEXT:    or a6, a6, a7
+; RV64I-NEXT:    slli t0, t0, 16
+; RV64I-NEXT:    slli t1, t1, 24
+; RV64I-NEXT:    or a7, t1, t0
+; RV64I-NEXT:    or a6, a7, a6
+; RV64I-NEXT:    lbu a7, 5(a1)
+; RV64I-NEXT:    lbu t0, 4(a1)
+; RV64I-NEXT:    lbu t1, 6(a1)
+; RV64I-NEXT:    lbu a1, 7(a1)
+; RV64I-NEXT:    slli a7, a7, 8
+; RV64I-NEXT:    or a7, a7, t0
+; RV64I-NEXT:    slli t1, t1, 16
+; RV64I-NEXT:    slli a1, a1, 24
+; RV64I-NEXT:    or a1, a1, t1
+; RV64I-NEXT:    or a1, a1, a7
+; RV64I-NEXT:    slli a1, a1, 32
+; RV64I-NEXT:    or a1, a1, a6
+; RV64I-NEXT:    sd zero, 56(sp)
+; RV64I-NEXT:    sd zero, 48(sp)
+; RV64I-NEXT:    sd zero, 40(sp)
+; RV64I-NEXT:    sd zero, 32(sp)
+; RV64I-NEXT:    sd a0, 24(sp)
+; RV64I-NEXT:    sd a5, 16(sp)
+; RV64I-NEXT:    sd a4, 8(sp)
+; RV64I-NEXT:    sd a3, 0(sp)
+; RV64I-NEXT:    andi a0, a1, 24
+; RV64I-NEXT:    mv a3, sp
+; RV64I-NEXT:    add a3, a3, a0
+; RV64I-NEXT:    ld a4, 8(a3)
+; RV64I-NEXT:    slli a1, a1, 3
+; RV64I-NEXT:    srl a5, a4, a1
+; RV64I-NEXT:    ld a6, 16(a3)
+; RV64I-NEXT:    andi a0, a1, 56
+; RV64I-NEXT:    xori a7, a0, 63
+; RV64I-NEXT:    ld t0, 0(a3)
+; RV64I-NEXT:    slli a0, a6, 1
+; RV64I-NEXT:    sll a0, a0, a7
+; RV64I-NEXT:    or a0, a5, a0
+; RV64I-NEXT:    srl t0, t0, a1
+; RV64I-NEXT:    slli a4, a4, 1
+; RV64I-NEXT:    ld a3, 24(a3)
+; RV64I-NEXT:    sll a4, a4, a7
+; RV64I-NEXT:    or a4, t0, a4
+; RV64I-NEXT:    srl a6, a6, a1
+; RV64I-NEXT:    slli t1, a3, 1
+; RV64I-NEXT:    sll a7, t1, a7
+; RV64I-NEXT:    or a7, a6, a7
+; RV64I-NEXT:    srl a1, a3, a1
+; RV64I-NEXT:    sb a6, 16(a2)
+; RV64I-NEXT:    sb a1, 24(a2)
+; RV64I-NEXT:    sb t0, 0(a2)
+; RV64I-NEXT:    sb a5, 8(a2)
+; RV64I-NEXT:    srli a3, a1, 56
+; RV64I-NEXT:    sb a3, 31(a2)
+; RV64I-NEXT:    srli a3, a1, 48
+; RV64I-NEXT:    sb a3, 30(a2)
+; RV64I-NEXT:    srli a3, a1, 40
+; RV64I-NEXT:    sb a3, 29(a2)
+; RV64I-NEXT:    srli a3, a1, 32
+; RV64I-NEXT:    sb a3, 28(a2)
+; RV64I-NEXT:    srli a3, a1, 24
+; RV64I-NEXT:    sb a3, 27(a2)
+; RV64I-NEXT:    srli a3, a1, 16
+; RV64I-NEXT:    sb a3, 26(a2)
+; RV64I-NEXT:    srli a1, a1, 8
+; RV64I-NEXT:    sb a1, 25(a2)
+; RV64I-NEXT:    srli a1, a7, 56
+; RV64I-NEXT:    sb a1, 23(a2)
+; RV64I-NEXT:    srli a1, a7, 48
+; RV64I-NEXT:    sb a1, 22(a2)
+; RV64I-NEXT:    srli a1, a7, 40
+; RV64I-NEXT:    sb a1, 21(a2)
+; RV64I-NEXT:    srli a1, a7, 32
+; RV64I-NEXT:    sb a1, 20(a2)
+; RV64I-NEXT:    srli a1, a7, 24
+; RV64I-NEXT:    sb a1, 19(a2)
+; RV64I-NEXT:    srli a1, a7, 16
+; RV64I-NEXT:    sb a1, 18(a2)
+; RV64I-NEXT:    srli a1, a7, 8
+; RV64I-NEXT:    sb a1, 17(a2)
+; RV64I-NEXT:    srli a1, a4, 56
+; RV64I-NEXT:    sb a1, 7(a2)
+; RV64I-NEXT:    srli a1, a4, 48
+; RV64I-NEXT:    sb a1, 6(a2)
+; RV64I-NEXT:    srli a1, a4, 40
+; RV64I-NEXT:    sb a1, 5(a2)
+; RV64I-NEXT:    srli a1, a4, 32
+; RV64I-NEXT:    sb a1, 4(a2)
+; RV64I-NEXT:    srli a1, a4, 24
+; RV64I-NEXT:    sb a1, 3(a2)
+; RV64I-NEXT:    srli a1, a4, 16
+; RV64I-NEXT:    sb a1, 2(a2)
+; RV64I-NEXT:    srli a4, a4, 8
+; RV64I-NEXT:    sb a4, 1(a2)
+; RV64I-NEXT:    srli a1, a0, 56
+; RV64I-NEXT:    sb a1, 15(a2)
+; RV64I-NEXT:    srli a1, a0, 48
+; RV64I-NEXT:    sb a1, 14(a2)
+; RV64I-NEXT:    srli a1, a0, 40
+; RV64I-NEXT:    sb a1, 13(a2)
+; RV64I-NEXT:    srli a1, a0, 32
+; RV64I-NEXT:    sb a1, 12(a2)
+; RV64I-NEXT:    srli a1, a0, 24
+; RV64I-NEXT:    sb a1, 11(a2)
+; RV64I-NEXT:    srli a1, a0, 16
+; RV64I-NEXT:    sb a1, 10(a2)
+; RV64I-NEXT:    srli a0, a0, 8
+; RV64I-NEXT:    sb a0, 9(a2)
+; RV64I-NEXT:    addi sp, sp, 64
+; RV64I-NEXT:    ret
+;
+; RV32I-LABEL: lshr_32bytes:
+; RV32I:       # %bb.0:
+; RV32I-NEXT:    addi sp, sp, -80
+; RV32I-NEXT:    sw s0, 76(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s1, 72(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s2, 68(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    lbu a3, 1(a0)
+; RV32I-NEXT:    lbu a4, 0(a0)
+; RV32I-NEXT:    lbu a5, 2(a0)
+; RV32I-NEXT:    lbu a6, 3(a0)
+; RV32I-NEXT:    slli a3, a3, 8
+; RV32I-NEXT:    or a3, a3, a4
+; RV32I-NEXT:    slli a5, a5, 16
+; RV32I-NEXT:    slli a6, a6, 24
+; RV32I-NEXT:    or a4, a6, a5
+; RV32I-NEXT:    or a3, a4, a3
+; RV32I-NEXT:    lbu a4, 5(a0)
+; RV32I-NEXT:    lbu a5, 4(a0)
+; RV32I-NEXT:    lbu a6, 6(a0)
+; RV32I-NEXT:    lbu a7, 7(a0)
+; RV32I-NEXT:    slli a4, a4, 8
+; RV32I-NEXT:    or a4, a4, a5
+; RV32I-NEXT:    slli a6, a6, 16
+; RV32I-NEXT:    slli a7, a7, 24
+; RV32I-NEXT:    or a5, a7, a6
+; RV32I-NEXT:    or a4, a5, a4
+; RV32I-NEXT:    lbu a5, 9(a0)
+; RV32I-NEXT:    lbu a6, 8(a0)
+; RV32I-NEXT:    lbu a7, 10(a0)
+; RV32I-NEXT:    lbu t0, 11(a0)
+; RV32I-NEXT:    slli a5, a5, 8
+; RV32I-NEXT:    or a5, a5, a6
+; RV32I-NEXT:    slli a7, a7, 16
+; RV32I-NEXT:    slli t0, t0, 24
+; RV32I-NEXT:    or a6, t0, a7
+; RV32I-NEXT:    or a5, a6, a5
+; RV32I-NEXT:    lbu a6, 13(a0)
+; RV32I-NEXT:    lbu a7, 12(a0)
+; RV32I-NEXT:    lbu t0, 14(a0)
+; RV32I-NEXT:    lbu t1, 15(a0)
+; RV32I-NEXT:    slli a6, a6, 8
+; RV32I-NEXT:    or a6, a6, a7
+; RV32I-NEXT:    slli t0, t0, 16
+; RV32I-NEXT:    slli t1, t1, 24
+; RV32I-NEXT:    or a7, t1, t0
+; RV32I-NEXT:    or a6, a7, a6
+; RV32I-NEXT:    lbu a7, 17(a0)
+; RV32I-NEXT:    lbu t0, 16(a0)
+; RV32I-NEXT:    lbu t1, 18(a0)
+; RV32I-NEXT:    lbu t2, 19(a0)
+; RV32I-NEXT:    slli a7, a7, 8
+; RV32I-NEXT:    or a7, a7, t0
+; RV32I-NEXT:    slli t1, t1, 16
+; RV32I-NEXT:    slli t2, t2, 24
+; RV32I-NEXT:    or t0, t2, t1
+; RV32I-NEXT:    or a7, t0, a7
+; RV32I-NEXT:    lbu t0, 21(a0)
+; RV32I-NEXT:    lbu t1, 20(a0)
+; RV32I-NEXT:    lbu t2, 22(a0)
+; RV32I-NEXT:    lbu t3, 23(a0)
+; RV32I-NEXT:    slli t0, t0, 8
+; RV32I-NEXT:    or t0, t0, t1
+; RV32I-NEXT:    slli t2, t2, 16
+; RV32I-NEXT:    slli t3, t3, 24
+; RV32I-NEXT:    or t1, t3, t2
+; RV32I-NEXT:    or t0, t1, t0
+; RV32I-NEXT:    lbu t1, 25(a0)
+; RV32I-NEXT:    lbu t2, 24(a0)
+; RV32I-NEXT:    lbu t3, 26(a0)
+; RV32I-NEXT:    lbu t4, 27(a0)
+; RV32I-NEXT:    slli t1, t1, 8
+; RV32I-NEXT:    or t1, t1, t2
+; RV32I-NEXT:    slli t3, t3, 16
+; RV32I-NEXT:    slli t4, t4, 24
+; RV32I-NEXT:    or t2, t4, t3
+; RV32I-NEXT:    or t1, t2, t1
+; RV32I-NEXT:    lbu t2, 29(a0)
+; RV32I-NEXT:    lbu t3, 28(a0)
+; RV32I-NEXT:    lbu t4, 30(a0)
+; RV32I-NEXT:    lbu a0, 31(a0)
+; RV32I-NEXT:    slli t2, t2, 8
+; RV32I-NEXT:    or t2, t2, t3
+; RV32I-NEXT:    slli t4, t4, 16
+; RV32I-NEXT:    slli a0, a0, 24
+; RV32I-NEXT:    or a0, a0, t4
+; RV32I-NEXT:    or a0, a0, t2
+; RV32I-NEXT:    lbu t2, 1(a1)
+; RV32I-NEXT:    lbu t3, 0(a1)
+; RV32I-NEXT:    lbu t4, 2(a1)
+; RV32I-NEXT:    lbu a1, 3(a1)
+; RV32I-NEXT:    slli t2, t2, 8
+; RV32I-NEXT:    or t2, t2, t3
+; RV32I-NEXT:    slli t4, t4, 16
+; RV32I-NEXT:    slli a1, a1, 24
+; RV32I-NEXT:    or a1, a1, t4
+; RV32I-NEXT:    or a1, a1, t2
+; RV32I-NEXT:    sw zero, 60(sp)
+; RV32I-NEXT:    sw zero, 56(sp)
+; RV32I-NEXT:    sw zero, 52(sp)
+; RV32I-NEXT:    sw zero, 48(sp)
+; RV32I-NEXT:    sw zero, 44(sp)
+; RV32I-NEXT:    sw zero, 40(sp)
+; RV32I-NEXT:    sw zero, 36(sp)
+; RV32I-NEXT:    sw zero, 32(sp)
+; RV32I-NEXT:    sw a0, 28(sp)
+; RV32I-NEXT:    sw t1, 24(sp)
+; RV32I-NEXT:    sw t0, 20(sp)
+; RV32I-NEXT:    sw a7, 16(sp)
+; RV32I-NEXT:    sw a6, 12(sp)
+; RV32I-NEXT:    sw a5, 8(sp)
+; RV32I-NEXT:    sw a4, 4(sp)
+; RV32I-NEXT:    sw a3, 0(sp)
+; RV32I-NEXT:    andi a0, a1, 28
+; RV32I-NEXT:    mv a3, sp
+; RV32I-NEXT:    add a5, a3, a0
+; RV32I-NEXT:    lw a3, 4(a5)
+; RV32I-NEXT:    slli a6, a1, 3
+; RV32I-NEXT:    srl a4, a3, a6
+; RV32I-NEXT:    lw a7, 8(a5)
+; RV32I-NEXT:    andi a0, a6, 24
+; RV32I-NEXT:    xori t0, a0, 31
+; RV32I-NEXT:    lw a1, 0(a5)
+; RV32I-NEXT:    slli a0, a7, 1
+; RV32I-NEXT:    sll a0, a0, t0
+; RV32I-NEXT:    or a0, a4, a0
+; RV32I-NEXT:    srl t1, a1, a6
+; RV32I-NEXT:    slli a3, a3, 1
+; RV32I-NEXT:    lw t2, 12(a5)
+; RV32I-NEXT:    lw t3, 16(a5)
+; RV32I-NEXT:    sll a1, a3, t0
+; RV32I-NEXT:    or a1, t1, a1
+; RV32I-NEXT:    srl t4, t2, a6
+; RV32I-NEXT:    slli a3, t3, 1
+; RV32I-NEXT:    sll a3, a3, t0
+; RV32I-NEXT:    or a3, t4, a3
+; RV32I-NEXT:    srl a7, a7, a6
+; RV32I-NEXT:    slli t2, t2, 1
+; RV32I-NEXT:    lw t5, 20(a5)
+; RV32I-NEXT:    lw t6, 24(a5)
+; RV32I-NEXT:    sll t2, t2, t0
+; RV32I-NEXT:    or t2, a7, t2
+; RV32I-NEXT:    srl s0, t5, a6
+; RV32I-NEXT:    slli s1, t6, 1
+; RV32I-NEXT:    sll s1, s1, t0
+; RV32I-NEXT:    or s1, s0, s1
+; RV32I-NEXT:    srl t3, t3, a6
+; RV32I-NEXT:    slli t5, t5, 1
+; RV32I-NEXT:    lw a5, 28(a5)
+; RV32I-NEXT:    sll t5, t5, t0
+; RV32I-NEXT:    or t5, t3, t5
+; RV32I-NEXT:    srl t6, t6, a6
+; RV32I-NEXT:    slli s2, a5, 1
+; RV32I-NEXT:    sll t0, s2, t0
+; RV32I-NEXT:    or t0, t6, t0
+; RV32I-NEXT:    srl a5, a5, a6
+; RV32I-NEXT:    sb t6, 24(a2)
+; RV32I-NEXT:    sb a5, 28(a2)
+; RV32I-NEXT:    sb t3, 16(a2)
+; RV32I-NEXT:    sb s0, 20(a2)
+; RV32I-NEXT:    sb a7, 8(a2)
+; RV32I-NEXT:    sb t4, 12(a2)
+; RV32I-NEXT:    sb t1, 0(a2)
+; RV32I-NEXT:    sb a4, 4(a2)
+; RV32I-NEXT:    srli a4, a5, 24
+; RV32I-NEXT:    sb a4, 31(a2)
+; RV32I-NEXT:    srli a4, a5, 16
+; RV32I-NEXT:    sb a4, 30(a2)
+; RV32I-NEXT:    srli a5, a5, 8
+; RV32I-NEXT:    sb a5, 29(a2)
+; RV32I-NEXT:    srli a4, t0, 24
+; RV32I-NEXT:    sb a4, 27(a2)
+; RV32I-NEXT:    srli a4, t0, 16
+; RV32I-NEXT:    sb a4, 26(a2)
+; RV32I-NEXT:    srli a4, t0, 8
+; RV32I-NEXT:    sb a4, 25(a2)
+; RV32I-NEXT:    srli a4, t5, 24
+; RV32I-NEXT:    sb a4, 19(a2)
+; RV32I-NEXT:    srli a4, t5, 16
+; RV32I-NEXT:    sb a4, 18(a2)
+; RV32I-NEXT:    srli a4, t5, 8
+; RV32I-NEXT:    sb a4, 17(a2)
+; RV32I-NEXT:    srli a4, s1, 24
+; RV32I-NEXT:    sb a4, 23(a2)
+; RV32I-NEXT:    srli a4, s1, 16
+; RV32I-NEXT:    sb a4, 22(a2)
+; RV32I-NEXT:    srli s1, s1, 8
+; RV32I-NEXT:    sb s1, 21(a2)
+; RV32I-NEXT:    srli a4, t2, 24
+; RV32I-NEXT:    sb a4, 11(a2)
+; RV32I-NEXT:    srli a4, t2, 16
+; RV32I-NEXT:    sb a4, 10(a2)
+; RV32I-NEXT:    srli a4, t2, 8
+; RV32I-NEXT:    sb a4, 9(a2)
+; RV32I-NEXT:    srli a4, a3, 24
+; RV32I-NEXT:    sb a4, 15(a2)
+; RV32I-NEXT:    srli a4, a3, 16
+; RV32I-NEXT:    sb a4, 14(a2)
+; RV32I-NEXT:    srli a3, a3, 8
+; RV32I-NEXT:    sb a3, 13(a2)
+; RV32I-NEXT:    srli a3, a1, 24
+; RV32I-NEXT:    sb a3, 3(a2)
+; RV32I-NEXT:    srli a3, a1, 16
+; RV32I-NEXT:    sb a3, 2(a2)
+; RV32I-NEXT:    srli a1, a1, 8
+; RV32I-NEXT:    sb a1, 1(a2)
+; RV32I-NEXT:    srli a1, a0, 24
+; RV32I-NEXT:    sb a1, 7(a2)
+; RV32I-NEXT:    srli a1, a0, 16
+; RV32I-NEXT:    sb a1, 6(a2)
+; RV32I-NEXT:    srli a0, a0, 8
+; RV32I-NEXT:    sb a0, 5(a2)
+; RV32I-NEXT:    lw s0, 76(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s1, 72(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s2, 68(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    addi sp, sp, 80
+; RV32I-NEXT:    ret
+  %src = load i256, ptr %src.ptr, align 1
+  %byteOff = load i256, ptr %byteOff.ptr, align 1
+  %bitOff = shl i256 %byteOff, 3
+  %res = lshr i256 %src, %bitOff
+  store i256 %res, ptr %dst, align 1
+  ret void
+}
+
+define void @lshr_32bytes_wordOff(ptr %src.ptr, ptr %wordOff.ptr, ptr %dst) nounwind {
+; RV64I-LABEL: lshr_32bytes_wordOff:
+; RV64I:       # %bb.0:
+; RV64I-NEXT:    addi sp, sp, -64
+; RV64I-NEXT:    lbu a3, 1(a0)
+; RV64I-NEXT:    lbu a4, 0(a0)
+; RV64I-NEXT:    lbu a5, 2(a0)
+; RV64I-NEXT:    lbu a6, 3(a0)
+; RV64I-NEXT:    slli a3, a3, 8
+; RV64I-NEXT:    or a3, a3, a4
+; RV64I-NEXT:    slli a5, a5, 16
+; RV64I-NEXT:    slli a6, a6, 24
+; RV64I-NEXT:    or a4, a6, a5
+; RV64I-NEXT:    or a3, a4, a3
+; RV64I-NEXT:    lbu a4, 5(a0)
+; RV64I-NEXT:    lbu a5, 4(a0)
+; RV64I-NEXT:    lbu a6, 6(a0)
+; RV64I-NEXT:    lbu a7, 7(a0)
+; RV64I-NEXT:    slli a4, a4, 8
+; RV64I-NEXT:    or a4, a4, a5
+; RV64I-NEXT:    slli a6, a6, 16
+; RV64I-NEXT:    slli a7, a7, 24
+; RV64I-NEXT:    or a5, a7, a6
+; RV64I-NEXT:    or a4, a5, a4
+; RV64I-NEXT:    slli a4, a4, 32
+; RV64I-NEXT:    or a3, a4, a3
+; RV64I-NEXT:    lbu a4, 9(a0)
+; RV64I-NEXT:    lbu a5, 8(a0)
+; RV64I-NEXT:    lbu a6, 10(a0)
+; RV64I-NEXT:    lbu a7, 11(a0)
+; RV64I-NEXT:    slli a4, a4, 8
+; RV64I-NEXT:    or a4, a4, a5
+; RV64I-NEXT:    slli a6, a6, 16
+; RV64I-NEXT:    slli a7, a7, 24
+; RV64I-NEXT:    or a5, a7, a6
+; RV64I-NEXT:    or a4, a5, a4
+; RV64I-NEXT:    lbu a5, 13(a0)
+; RV64I-NEXT:    lbu a6, 12(a0)
+; RV64I-NEXT:    lbu a7, 14(a0)
+; RV64I-NEXT:    lbu t0, 15(a0)
+; RV64I-NEXT:    slli a5, a5, 8
+; RV64I-NEXT:    or a5, a5, a6
+; RV64I-NEXT:    slli a7, a7, 16
+; RV64I-NEXT:    slli t0, t0, 24
+; RV64I-NEXT:    or a6, t0, a7
+; RV64I-NEXT:    or a5, a6, a5
+; RV64I-NEXT:    slli a5, a5, 32
+; RV64I-NEXT:    or a4, a5, a4
+; RV64I-NEXT:    lbu a5, 17(a0)
+; RV64I-NEXT:    lbu a6, 16(a0)
+; RV64I-NEXT:    lbu a7, 18(a0)
+; RV64I-NEXT:    lbu t0, 19(a0)
+; RV64I-NEXT:    slli a5, a5, 8
+; RV64I-NEXT:    or a5, a5, a6
+; RV64I-NEXT:    slli a7, a7, 16
+; RV64I-NEXT:    slli t0, t0, 24
+; RV64I-NEXT:    or a6, t0, a7
+; RV64I-NEXT:    or a5, a6, a5
+; RV64I-NEXT:    lbu a6, 21(a0)
+; RV64I-NEXT:    lbu a7, 20(a0)
+; RV64I-NEXT:    lbu t0, 22(a0)
+; RV64I-NEXT:    lbu t1, 23(a0)
+; RV64I-NEXT:    slli a6, a6, 8
+; RV64I-NEXT:    or a6, a6, a7
+; RV64I-NEXT:    slli t0, t0, 16
+; RV64I-NEXT:    slli t1, t1, 24
+; RV64I-NEXT:    or a7, t1, t0
+; RV64I-NEXT:    or a6, a7, a6
+; RV64I-NEXT:    slli a6, a6, 32
+; RV64I-NEXT:    or a5, a6, a5
+; RV64I-NEXT:    lbu a6, 25(a0)
+; RV64I-NEXT:    lbu a7, 24(a0)
+; RV64I-NEXT:    lbu t0, 26(a0)
+; RV64I-NEXT:    lbu t1, 27(a0)
+; RV64I-NEXT:    slli a6, a6, 8
+; RV64I-NEXT:    or a6, a6, a7
+; RV64I-NEXT:    slli t0, t0, 16
+; RV64I-NEXT:    slli t1, t1, 24
+; RV64I-NEXT:    or a7, t1, t0
+; RV64I-NEXT:    or a6, a7, a6
+; RV64I-NEXT:    lbu a7, 29(a0)
+; RV64I-NEXT:    lbu t0, 28(a0)
+; RV64I-NEXT:    lbu t1, 30(a0)
+; RV64I-NEXT:    lbu a0, 31(a0)
+; RV64I-NEXT:    slli a7, a7, 8
+; RV64I-NEXT:    or a7, a7, t0
+; RV64I-NEXT:    slli t1, t1, 16
+; RV64I-NEXT:    slli a0, a0, 24
+; RV64I-NEXT:    or a0, a0, t1
+; RV64I-NEXT:    or a0, a0, a7
+; RV64I-NEXT:    slli a0, a0, 32
+; RV64I-NEXT:    or a0, a0, a6
+; RV64I-NEXT:    lbu a6, 1(a1)
+; RV64I-NEXT:    lbu a7, 0(a1)
+; RV64I-NEXT:    lbu t0, 2(a1)
+; RV64I-NEXT:    lbu t1, 3(a1)
+; RV64I-NEXT:    slli a6, a6, 8
+; RV64I-NEXT:    or a6, a6, a7
+; RV64I-NEXT:    slli t0, t0, 16
+; RV64I-NEXT:    slli t1, t1, 24
+; RV64I-NEXT:    or a7, t1, t0
+; RV64I-NEXT:    or a6, a7, a6
+; RV64I-NEXT:    lbu a7, 5(a1)
+; RV64I-NEXT:    lbu t0, 4(a1)
+; RV64I-NEXT:    lbu t1, 6(a1)
+; RV64I-NEXT:    lbu a1, 7(a1)
+; RV64I-NEXT:    slli a7, a7, 8
+; RV64I-NEXT:    or a7, a7, t0
+; RV64I-NEXT:    slli t1, t1, 16
+; RV64I-NEXT:    slli a1, a1, 24
+; RV64I-NEXT:    or a1, a1, t1
+; RV64I-NEXT:    or a1, a1, a7
+; RV64I-NEXT:    slli a1, a1, 32
+; RV64I-NEXT:    or a1, a1, a6
+; RV64I-NEXT:    sd zero, 56(sp)
+; RV64I-NEXT:    sd zero, 48(sp)
+; RV64I-NEXT:    sd zero, 40(sp)
+; RV64I-NEXT:    sd zero, 32(sp)
+; RV64I-NEXT:    sd a0, 24(sp)
+; RV64I-NEXT:    sd a5, 16(sp)
+; RV64I-NEXT:    sd a4, 8(sp)
+; RV64I-NEXT:    sd a3, 0(sp)
+; RV64I-NEXT:    slli a0, a1, 2
+; RV64I-NEXT:    andi a0, a0, 24
+; RV64I-NEXT:    mv a3, sp
+; RV64I-NEXT:    add a3, a3, a0
+; RV64I-NEXT:    ld a4, 8(a3)
+; RV64I-NEXT:    slli a5, a1, 5
+; RV64I-NEXT:    srl a1, a4, a5
+; RV64I-NEXT:    ld a6, 16(a3)
+; RV64I-NEXT:    andi a0, a5, 32
+; RV64I-NEXT:    xori a7, a0, 63
+; RV64I-NEXT:    ld t0, 0(a3)
+; RV64I-NEXT:    slli a0, a6, 1
+; RV64I-NEXT:    sll a0, a0, a7
+; RV64I-NEXT:    or a0, a1, a0
+; RV64I-NEXT:    srl t0, t0, a5
+; RV64I-NEXT:    slli a4, a4, 1
+; RV64I-NEXT:    ld a3, 24(a3)
+; RV64I-NEXT:    sll a4, a4, a7
+; RV64I-NEXT:    or a4, t0, a4
+; RV64I-NEXT:    srl a6, a6, a5
+; RV64I-NEXT:    slli t1, a3, 1
+; RV64I-NEXT:    sll a7, t1, a7
+; RV64I-NEXT:    or a7, a6, a7
+; RV64I-NEXT:    srl a3, a3, a5
+; RV64I-NEXT:    sb a6, 16(a2)
+; RV64I-NEXT:    sb a3, 24(a2)
+; RV64I-NEXT:    sb t0, 0(a2)
+; RV64I-NEXT:    sb a1, 8(a2)
+; RV64I-NEXT:    srli a5, a6, 24
+; RV64I-NEXT:    sb a5, 19(a2)
+; RV64I-NEXT:    srli a5, a6, 16
+; RV64I-NEXT:    sb a5, 18(a2)
+; RV64I-NEXT:    srli a5, a6, 8
+; RV64I-NEXT:    sb a5, 17(a2)
+; RV64I-NEXT:    srli a5, a3, 56
+; RV64I-NEXT:    sb a5, 31(a2)
+; RV64I-NEXT:    srli a5, a3, 48
+; RV64I-NEXT:    sb a5, 30(a2)
+; RV64I-NEXT:    srli a5, a3, 40
+; RV64I-NEXT:    sb a5, 29(a2)
+; RV64I-NEXT:    srli a5, a3, 32
+; RV64I-NEXT:    sb a5, 28(a2)
+; RV64I-NEXT:    srli a5, a3, 24
+; RV64I-NEXT:    sb a5, 27(a2)
+; RV64I-NEXT:    srli a5, a3, 16
+; RV64I-NEXT:    sb a5, 26(a2)
+; RV64I-NEXT:    srli a3, a3, 8
+; RV64I-NEXT:    sb a3, 25(a2)
+; RV64I-NEXT:    srli a3, t0, 24
+; RV64I-NEXT:    sb a3, 3(a2)
+; RV64I-NEXT:    srli a3, t0, 16
+; RV64I-NEXT:    sb a3, 2(a2)
+; RV64I-NEXT:    srli a3, t0, 8
+; RV64I-NEXT:    sb a3, 1(a2)
+; RV64I-NEXT:    srli a3, a1, 24
+; RV64I-NEXT:    sb a3, 11(a2)
+; RV64I-NEXT:    srli a3, a1, 16
+; RV64I-NEXT:    sb a3, 10(a2)
+; RV64I-NEXT:    srli a1, a1, 8
+; RV64I-NEXT:    sb a1, 9(a2)
+; RV64I-NEXT:    srli a1, a7, 56
+; RV64I-NEXT:    sb a1, 23(a2)
+; RV64I-NEXT:    srli a1, a7, 48
+; RV64I-NEXT:    sb a1, 22(a2)
+; RV64I-NEXT:    srli a1, a7, 40
+; RV64I-NEXT:    sb a1, 21(a2)
+; RV64I-NEXT:    srli a1, a7, 32
+; RV64I-NEXT:    sb a1, 20(a2)
+; RV64I-NEXT:    srli a1, a4, 56
+; RV64I-NEXT:    sb a1, 7(a2)
+; RV64I-NEXT:    srli a1, a4, 48
+; RV64I-NEXT:    sb a1, 6(a2)
+; RV64I-NEXT:    srli a1, a4, 40
+; RV64I-NEXT:    sb a1, 5(a2)
+; RV64I-NEXT:    srli a4, a4, 32
+; RV64I-NEXT:    sb a4, 4(a2)
+; RV64I-NEXT:    srli a1, a0, 56
+; RV64I-NEXT:    sb a1, 15(a2)
+; RV64I-NEXT:    srli a1, a0, 48
+; RV64I-NEXT:    sb a1, 14(a2)
+; RV64I-NEXT:    srli a1, a0, 40
+; RV64I-NEXT:    sb a1, 13(a2)
+; RV64I-NEXT:    srli a0, a0, 32
+; RV64I-NEXT:    sb a0, 12(a2)
+; RV64I-NEXT:    addi sp, sp, 64
+; RV64I-NEXT:    ret
+;
+; RV32I-LABEL: lshr_32bytes_wordOff:
+; RV32I:       # %bb.0:
+; RV32I-NEXT:    addi sp, sp, -64
+; RV32I-NEXT:    lbu a3, 1(a0)
+; RV32I-NEXT:    lbu a4, 0(a0)
+; RV32I-NEXT:    lbu a5, 2(a0)
+; RV32I-NEXT:    lbu a6, 3(a0)
+; RV32I-NEXT:    slli a3, a3, 8
+; RV32I-NEXT:    or a3, a3, a4
+; RV32I-NEXT:    slli a5, a5, 16
+; RV32I-NEXT:    slli a6, a6, 24
+; RV32I-NEXT:    or a4, a6, a5
+; RV32I-NEXT:    or a3, a4, a3
+; RV32I-NEXT:    lbu a4, 5(a0)
+; RV32I-NEXT:    lbu a5, 4(a0)
+; RV32I-NEXT:    lbu a6, 6(a0)
+; RV32I-NEXT:    lbu a7, 7(a0)
+; RV32I-NEXT:    slli a4, a4, 8
+; RV32I-NEXT:    or a4, a4, a5
+; RV32I-NEXT:    slli a6, a6, 16
+; RV32I-NEXT:    slli a7, a7, 24
+; RV32I-NEXT:    or a5, a7, a6
+; RV32I-NEXT:    or a4, a5, a4
+; RV32I-NEXT:    lbu a5, 9(a0)
+; RV32I-NEXT:    lbu a6, 8(a0)
+; RV32I-NEXT:    lbu a7, 10(a0)
+; RV32I-NEXT:    lbu t0, 11(a0)
+; RV32I-NEXT:    slli a5, a5, 8
+; RV32I-NEXT:    or a5, a5, a6
+; RV32I-NEXT:    slli a7, a7, 16
+; RV32I-NEXT:    slli t0, t0, 24
+; RV32I-NEXT:    or a6, t0, a7
+; RV32I-NEXT:    or a5, a6, a5
+; RV32I-NEXT:    lbu a6, 13(a0)
+; RV32I-NEXT:    lbu a7, 12(a0)
+; RV32I-NEXT:    lbu t0, 14(a0)
+; RV32I-NEXT:    lbu t1, 15(a0)
+; RV32I-NEXT:    slli a6, a6, 8
+; RV32I-NEXT:    or a6, a6, a7
+; RV32I-NEXT:    slli t0, t0, 16
+; RV32I-NEXT:    slli t1, t1, 24
+; RV32I-NEXT:    or a7, t1, t0
+; RV32I-NEXT:    or a6, a7, a6
+; RV32I-NEXT:    lbu a7, 17(a0)
+; RV32I-NEXT:    lbu t0, 16(a0)
+; RV32I-NEXT:    lbu t1, 18(a0)
+; RV32I-NEXT:    lbu t2, 19(a0)
+; RV32I-NEXT:    slli a7, a7, 8
+; RV32I-NEXT:    or a7, a7, t0
+; RV32I-NEXT:    slli t1, t1, 16
+; RV32I-NEXT:    slli t2, t2, 24
+; RV32I-NEXT:    or t0, t2, t1
+; RV32I-NEXT:    or a7, t0, a7
+; RV32I-NEXT:    lbu t0, 21(a0)
+; RV32I-NEXT:    lbu t1, 20(a0)
+; RV32I-NEXT:    lbu t2, 22(a0)
+; RV32I-NEXT:    lbu t3, 23(a0)
+; RV32I-NEXT:    slli t0, t0, 8
+; RV32I-NEXT:    or t0, t0, t1
+; RV32I-NEXT:    slli t2, t2, 16
+; RV32I-NEXT:    slli t3, t3, 24
+; RV32I-NEXT:    or t1, t3, t2
+; RV32I-NEXT:    or t0, t1, t0
+; RV32I-NEXT:    lbu t1, 25(a0)
+; RV32I-NEXT:    lbu t2, 24(a0)
+; RV32I-NEXT:    lbu t3, 26(a0)
+; RV32I-NEXT:    lbu t4, 27(a0)
+; RV32I-NEXT:    slli t1, t1, 8
+; RV32I-NEXT:    or t1, t1, t2
+; RV32I-NEXT:    slli t3, t3, 16
+; RV32I-NEXT:    slli t4, t4, 24
+; RV32I-NEXT:    or t2, t4, t3
+; RV32I-NEXT:    or t1, t2, t1
+; RV32I-NEXT:    lbu t2, 29(a0)
+; RV32I-NEXT:    lbu t3, 28(a0)
+; RV32I-NEXT:    lbu t4, 30(a0)
+; RV32I-NEXT:    lbu a0, 31(a0)
+; RV32I-NEXT:    slli t2, t2, 8
+; RV32I-NEXT:    or t2, t2, t3
+; RV32I-NEXT:    slli t4, t4, 16
+; RV32I-NEXT:    slli a0, a0, 24
+; RV32I-NEXT:    or a0, a0, t4
+; RV32I-NEXT:    or a0, a0, t2
+; RV32I-NEXT:    lbu a1, 0(a1)
+; RV32I-NEXT:    sw zero, 60(sp)
+; RV32I-NEXT:    sw zero, 56(sp)
+; RV32I-NEXT:    sw zero, 52(sp)
+; RV32I-NEXT:    sw zero, 48(sp)
+; RV32I-NEXT:    sw zero, 44(sp)
+; RV32I-NEXT:    sw zero, 40(sp)
+; RV32I-NEXT:    sw zero, 36(sp)
+; RV32I-NEXT:    sw zero, 32(sp)
+; RV32I-NEXT:    sw a0, 28(sp)
+; RV32I-NEXT:    sw t1, 24(sp)
+; RV32I-NEXT:    sw t0, 20(sp)
+; RV32I-NEXT:    sw a7, 16(sp)
+; RV32I-NEXT:    sw a6, 12(sp)
+; RV32I-NEXT:    sw a5, 8(sp)
+; RV32I-NEXT:    sw a4, 4(sp)
+; RV32I-NEXT:    sw a3, 0(sp)
+; RV32I-NEXT:    slli a1, a1, 2
+; RV32I-NEXT:    andi a1, a1, 28
+; RV32I-NEXT:    mv a0, sp
+; RV32I-NEXT:    add a3, a0, a1
+; RV32I-NEXT:    lw a0, 4(a3)
+; RV32I-NEXT:    lw a1, 0(a3)
+; RV32I-NEXT:    lw a4, 12(a3)
+; RV32I-NEXT:    lw a5, 8(a3)
+; RV32I-NEXT:    lw a6, 24(a3)
+; RV32I-NEXT:    lw a7, 28(a3)
+; RV32I-NEXT:    lw t0, 16(a3)
+; RV32I-NEXT:    lw a3, 20(a3)
+; RV32I-NEXT:    sb a6, 24(a2)
+; RV32I-NEXT:    sb a7, 28(a2)
+; RV32I-NEXT:    sb t0, 16(a2)
+; RV32I-NEXT:    sb a3, 20(a2)
+; RV32I-NEXT:    sb a5, 8(a2)
+; RV32I-NEXT:    sb a4, 12(a2)
+; RV32I-NEXT:    sb a1, 0(a2)
+; RV32I-NEXT:    sb a0, 4(a2)
+; RV32I-NEXT:    srli t1, a6, 24
+; RV32I-NEXT:    sb t1, 27(a2)
+; RV32I-NEXT:    srli t1, a6, 16
+; RV32I-NEXT:    sb t1, 26(a2)
+; RV32I-NEXT:    srli a6, a6, 8
+; RV32I-NEXT:    sb a6, 25(a2)
+; RV32I-NEXT:    srli a6, a7, 24
+; RV32I-NEXT:    sb a6, 31(a2)
+; RV32I-NEXT:    srli a6, a7, 16
+; RV32I-NEXT:    sb a6, 30(a2)
+; RV32I-NEXT:    srli a6, a7, 8
+; RV32I-NEXT:    sb a6, 29(a2)
+; RV32I-NEXT:    srli a6, t0, 24
+; RV32I-NEXT:    sb a6, 19(a2)
+; RV32I-NEXT:    srli a6, t0, 16
+; RV32I-NEXT:    sb a6, 18(a2)
+; RV32I-NEXT:    srli a6, t0, 8
+; RV32I-NEXT:    sb a6, 17(a2)
+; RV32I-NEXT:    srli a6, a3, 24
+; RV32I-NEXT:    sb a6, 23(a2)
+; RV32I-NEXT:    srli a6, a3, 16
+; RV32I-NEXT:    sb a6, 22(a2)
+; RV32I-NEXT:    srli a3, a3, 8
+; RV32I-NEXT:    sb a3, 21(a2)
+; RV32I-NEXT:    srli a3, a5, 24
+; RV32I-NEXT:    sb a3, 11(a2)
+; RV32I-NEXT:    srli a3, a5, 16
+; RV32I-NEXT:    sb a3, 10(a2)
+; RV32I-NEXT:    srli a5, a5, 8
+; RV32I-NEXT:    sb a5, 9(a2)
+; RV32I-NEXT:    srli a3, a4, 24
+; RV32I-NEXT:    sb a3, 15(a2)
+; RV32I-NEXT:    srli a3, a4, 16
+; RV32I-NEXT:    sb a3, 14(a2)
+; RV32I-NEXT:    srli a4, a4, 8
+; RV32I-NEXT:    sb a4, 13(a2)
+; RV32I-NEXT:    srli a3, a1, 24
+; RV32I-NEXT:    sb a3, 3(a2)
+; RV32I-NEXT:    srli a3, a1, 16
+; RV32I-NEXT:    sb a3, 2(a2)
+; RV32I-NEXT:    srli a1, a1, 8
+; RV32I-NEXT:    sb a1, 1(a2)
+; RV32I-NEXT:    srli a1, a0, 24
+; RV32I-NEXT:    sb a1, 7(a2)
+; RV32I-NEXT:    srli a1, a0, 16
+; RV32I-NEXT:    sb a1, 6(a2)
+; RV32I-NEXT:    srli a0, a0, 8
+; RV32I-NEXT:    sb a0, 5(a2)
+; RV32I-NEXT:    addi sp, sp, 64
+; RV32I-NEXT:    ret
+  %src = load i256, ptr %src.ptr, align 1
+  %wordOff = load i256, ptr %wordOff.ptr, align 1
+  %bitOff = shl i256 %wordOff, 5
+  %res = lshr i256 %src, %bitOff
+  store i256 %res, ptr %dst, align 1
+  ret void
+}
+
+define void @lshr_32bytes_dwordOff(ptr %src.ptr, ptr %dwordOff.ptr, ptr %dst) nounwind {
+; RV64I-LABEL: lshr_32bytes_dwordOff:
+; RV64I:       # %bb.0:
+; RV64I-NEXT:    addi sp, sp, -64
+; RV64I-NEXT:    lbu a3, 1(a0)
+; RV64I-NEXT:    lbu a4, 0(a0)
+; RV64I-NEXT:    lbu a5, 2(a0)
+; RV64I-NEXT:    lbu a6, 3(a0)
+; RV64I-NEXT:    slli a3, a3, 8
+; RV64I-NEXT:    or a3, a3, a4
+; RV64I-NEXT:    slli a5, a5, 16
+; RV64I-NEXT:    slli a6, a6, 24
+; RV64I-NEXT:    or a4, a6, a5
+; RV64I-NEXT:    or a3, a4, a3
+; RV64I-NEXT:    lbu a4, 5(a0)
+; RV64I-NEXT:    lbu a5, 4(a0)
+; RV64I-NEXT:    lbu a6, 6(a0)
+; RV64I-NEXT:    lbu a7, 7(a0)
+; RV64I-NEXT:    slli a4, a4, 8
+; RV64I-NEXT:    or a4, a4, a5
+; RV64I-NEXT:    slli a6, a6, 16
+; RV64I-NEXT:    slli a7, a7, 24
+; RV64I-NEXT:    or a5, a7, a6
+; RV64I-NEXT:    or a4, a5, a4
+; RV64I-NEXT:    slli a4, a4, 32
+; RV64I-NEXT:    or a3, a4, a3
+; RV64I-NEXT:    lbu a4, 9(a0)
+; RV64I-NEXT:    lbu a5, 8(a0)
+; RV64I-NEXT:    lbu a6, 10(a0)
+; RV64I-NEXT:    lbu a7, 11(a0)
+; RV64I-NEXT:    slli a4, a4, 8
+; RV64I-NEXT:    or a4, a4, a5
+; RV64I-NEXT:    slli a6, a6, 16
+; RV64I-NEXT:    slli a7, a7, 24
+; RV64I-NEXT:    or a5, a7, a6
+; RV64I-NEXT:    or a4, a5, a4
+; RV64I-NEXT:    lbu a5, 13(a0)
+; RV64I-NEXT:    lbu a6, 12(a0)
+; RV64I-NEXT:    lbu a7, 14(a0)
+; RV64I-NEXT:    lbu t0, 15(a0)
+; RV64I-NEXT:    slli a5, a5, 8
+; RV64I-NEXT:    or a5, a5, a6
+; RV64I-NEXT:    slli a7, a7, 16
+; RV64I-NEXT:    slli t0, t0, 24
+; RV64I-NEXT:    or a6, t0, a7
+; RV64I-NEXT:    or a5, a6, a5
+; RV64I-NEXT:    slli a5, a5, 32
+; RV64I-NEXT:    or a4, a5, a4
+; RV64I-NEXT:    lbu a5, 17(a0)
+; RV64I-NEXT:    lbu a6, 16(a0)
+; RV64I-NEXT:    lbu a7, 18(a0)
+; RV64I-NEXT:    lbu t0, 19(a0)
+; RV64I-NEXT:    slli a5, a5, 8
+; RV64I-NEXT:    or a5, a5, a6
+; RV64I-NEXT:    slli a7, a7, 16
+; RV64I-NEXT:    slli t0, t0, 24
+; RV64I-NEXT:    or a6, t0, a7
+; RV64I-NEXT:    or a5, a6, a5
+; RV64I-NEXT:    lbu a6, 21(a0)
+; RV64I-NEXT:    lbu a7, 20(a0)
+; RV64I-NEXT:    lbu t0, 22(a0)
+; RV64I-NEXT:    lbu t1, 23(a0)
+; RV64I-NEXT:    slli a6, a6, 8
+; RV64I-NEXT:    or a6, a6, a7
+; RV64I-NEXT:    slli t0, t0, 16
+; RV64I-NEXT:    slli t1, t1, 24
+; RV64I-NEXT:    or a7, t1, t0
+; RV64I-NEXT:    or a6, a7, a6
+; RV64I-NEXT:    slli a6, a6, 32
+; RV64I-NEXT:    or a5, a6, a5
+; RV64I-NEXT:    lbu a6, 25(a0)
+; RV64I-NEXT:    lbu a7, 24(a0)
+; RV64I-NEXT:    lbu t0, 26(a0)
+; RV64I-NEXT:    lbu t1, 27(a0)
+; RV64I-NEXT:    slli a6, a6, 8
+; RV64I-NEXT:    or a6, a6, a7
+; RV64I-NEXT:    slli t0, t0, 16
+; RV64I-NEXT:    slli t1, t1, 24
+; RV64I-NEXT:    or a7, t1, t0
+; RV64I-NEXT:    or a6, a7, a6
+; RV64I-NEXT:    lbu a7, 29(a0)
+; RV64I-NEXT:    lbu t0, 28(a0)
+; RV64I-NEXT:    lbu t1, 30(a0)
+; RV64I-NEXT:    lbu a0, 31(a0)
+; RV64I-NEXT:    slli a7, a7, 8
+; RV64I-NEXT:    or a7, a7, t0
+; RV64I-NEXT:    slli t1, t1, 16
+; RV64I-NEXT:    slli a0, a0, 24
+; RV64I-NEXT:    or a0, a0, t1
+; RV64I-NEXT:    or a0, a0, a7
+; RV64I-NEXT:    slli a0, a0, 32
+; RV64I-NEXT:    or a0, a0, a6
+; RV64I-NEXT:    lbu a1, 0(a1)
+; RV64I-NEXT:    sd zero, 56(sp)
+; RV64I-NEXT:    sd zero, 48(sp)
+; RV64I-NEXT:    sd zero, 40(sp)
+; RV64I-NEXT:    sd zero, 32(sp)
+; RV64I-NEXT:    sd a0, 24(sp)
+; RV64I-NEXT:    sd a5, 16(sp)
+; RV64I-NEXT:    sd a4, 8(sp)
+; RV64I-NEXT:    sd a3, 0(sp)
+; RV64I-NEXT:    slli a1, a1, 3
+; RV64I-NEXT:    andi a1, a1, 24
+; RV64I-NEXT:    mv a0, sp
+; RV64I-NEXT:    add a0, a0, a1
+; RV64I-NEXT:    ld a1, 16(a0)
+; RV64I-NEXT:    ld a3, 24(a0)
+; RV64I-NEXT:    ld a4, 0(a0)
+; RV64I-NEXT:    ld a0, 8(a0)
+; RV64I-NEXT:    sb a1, 16(a2)
+; RV64I-NEXT:    sb a3, 24(a2)
+; RV64I-NEXT:    sb a4, 0(a2)
+; RV64I-NEXT:    sb a0, 8(a2)
+; RV64I-NEXT:    srli a5, a1, 56
+; RV64I-NEXT:    sb a5, 23(a2)
+; RV64I-NEXT:    srli a5, a1, 48
+; RV64I-NEXT:    sb a5, 22(a2)
+; RV64I-NEXT:    srli a5, a1, 40
+; RV64I-NEXT:    sb a5, 21(a2)
+; RV64I-NEXT:    srli a5, a1, 32
+; RV64I-NEXT:    sb a5, 20(a2)
+; RV64I-NEXT:    srli a5, a1, 24
+; RV64I-NEXT:    sb a5, 19(a2)
+; RV64I-NEXT:    srli a5, a1, 16
+; RV64I-NEXT:    sb a5, 18(a2)
+; RV64I-NEXT:    srli a1, a1, 8
+; RV64I-NEXT:    sb a1, 17(a2)
+; RV64I-NEXT:    srli a1, a3, 56
+; RV64I-NEXT:    sb a1, 31(a2)
+; RV64I-NEXT:    srli a1, a3, 48
+; RV64I-NEXT:    sb a1, 30(a2)
+; RV64I-NEXT:    srli a1, a3, 40
+; RV64I-NEXT:    sb a1, 29(a2)
+; RV64I-NEXT:    srli a1, a3, 32
+; RV64I-NEXT:    sb a1, 28(a2)
+; RV64I-NEXT:    srli a1, a3, 24
+; RV64I-NEXT:    sb a1, 27(a2)
+; RV64I-NEXT:    srli a1, a3, 16
+; RV64I-NEXT:    sb a1, 26(a2)
+; RV64I-NEXT:    srli a3, a3, 8
+; RV64I-NEXT:    sb a3, 25(a2)
+; RV64I-NEXT:    srli a1, a4, 56
+; RV64I-NEXT:    sb a1, 7(a2)
+; RV64I-NEXT:    srli a1, a4, 48
+; RV64I-NEXT:    sb a1, 6(a2)
+; RV64I-NEXT:    srli a1, a4, 40
+; RV64I-NEXT:    sb a1, 5(a2)
+; RV64I-NEXT:    srli a1, a4, 32
+; RV64I-NEXT:    sb a1, 4(a2)
+; RV64I-NEXT:    srli a1, a4, 24
+; RV64I-NEXT:    sb a1, 3(a2)
+; RV64I-NEXT:    srli a1, a4, 16
+; RV64I-NEXT:    sb a1, 2(a2)
+; RV64I-NEXT:    srli a4, a4, 8
+; RV64I-NEXT:    sb a4, 1(a2)
+; RV64I-NEXT:    srli a1, a0, 56
+; RV64I-NEXT:    sb a1, 15(a2)
+; RV64I-NEXT:    srli a1, a0, 48
+; RV64I-NEXT:    sb a1, 14(a2)
+; RV64I-NEXT:    srli a1, a0, 40
+; RV64I-NEXT:    sb a1, 13(a2)
+; RV64I-NEXT:    srli a1, a0, 32
+; RV64I-NEXT:    sb a1, 12(a2)
+; RV64I-NEXT:    srli a1, a0, 24
+; RV64I-NEXT:    sb a1, 11(a2)
+; RV64I-NEXT:    srli a1, a0, 16
+; RV64I-NEXT:    sb a1, 10(a2)
+; RV64I-NEXT:    srli a0, a0, 8
+; RV64I-NEXT:    sb a0, 9(a2)
+; RV64I-NEXT:    addi sp, sp, 64
+; RV64I-NEXT:    ret
+;
+; RV32I-LABEL: lshr_32bytes_dwordOff:
+; RV32I:       # %bb.0:
+; RV32I-NEXT:    addi sp, sp, -64
+; RV32I-NEXT:    lbu a3, 1(a0)
+; RV32I-NEXT:    lbu a4, 0(a0)
+; RV32I-NEXT:    lbu a5, 2(a0)
+; RV32I-NEXT:    lbu a6, 3(a0)
+; RV32I-NEXT:    slli a3, a3, 8
+; RV32I-NEXT:    or a3, a3, a4
+; RV32I-NEXT:    slli a5, a5, 16
+; RV32I-NEXT:    slli a6, a6, 24
+; RV32I-NEXT:    or a4, a6, a5
+; RV32I-NEXT:    or a3, a4, a3
+; RV32I-NEXT:    lbu a4, 5(a0)
+; RV32I-NEXT:    lbu a5, 4(a0)
+; RV32I-NEXT:    lbu a6, 6(a0)
+; RV32I-NEXT:    lbu a7, 7(a0)
+; RV32I-NEXT:    slli a4, a4, 8
+; RV32I-NEXT:    or a4, a4, a5
+; RV32I-NEXT:    slli a6, a6, 16
+; RV32I-NEXT:    slli a7, a7, 24
+; RV32I-NEXT:    or a5, a7, a6
+; RV32I-NEXT:    or a4, a5, a4
+; RV32I-NEXT:    lbu a5, 9(a0)
+; RV32I-NEXT:    lbu a6, 8(a0)
+; RV32I-NEXT:    lbu a7, 10(a0)
+; RV32I-NEXT:    lbu t0, 11(a0)
+; RV32I-NEXT:    slli a5, a5, 8
+; RV32I-NEXT:    or a5, a5, a6
+; RV32I-NEXT:    slli a7, a7, 16
+; RV32I-NEXT:    slli t0, t0, 24
+; RV32I-NEXT:    or a6, t0, a7
+; RV32I-NEXT:    or a5, a6, a5
+; RV32I-NEXT:    lbu a6, 13(a0)
+; RV32I-NEXT:    lbu a7, 12(a0)
+; RV32I-NEXT:    lbu t0, 14(a0)
+; RV32I-NEXT:    lbu t1, 15(a0)
+; RV32I-NEXT:    slli a6, a6, 8
+; RV32I-NEXT:    or a6, a6, a7
+; RV32I-NEXT:    slli t0, t0, 16
+; RV32I-NEXT:    slli t1, t1, 24
+; RV32I-NEXT:    or a7, t1, t0
+; RV32I-NEXT:    or a6, a7, a6
+; RV32I-NEXT:    lbu a7, 17(a0)
+; RV32I-NEXT:    lbu t0, 16(a0)
+; RV32I-NEXT:    lbu t1, 18(a0)
+; RV32I-NEXT:    lbu t2, 19(a0)
+; RV32I-NEXT:    slli a7, a7, 8
+; RV32I-NEXT:    or a7, a7, t0
+; RV32I-NEXT:    slli t1, t1, 16
+; RV32I-NEXT:    slli t2, t2, 24
+; RV32I-NEXT:    or t0, t2, t1
+; RV32I-NEXT:    or a7, t0, a7
+; RV32I-NEXT:    lbu t0, 21(a0)
+; RV32I-NEXT:    lbu t1, 20(a0)
+; RV32I-NEXT:    lbu t2, 22(a0)
+; RV32I-NEXT:    lbu t3, 23(a0)
+; RV32I-NEXT:    slli t0, t0, 8
+; RV32I-NEXT:    or t0, t0, t1
+; RV32I-NEXT:    slli t2, t2, 16
+; RV32I-NEXT:    slli t3, t3, 24
+; RV32I-NEXT:    or t1, t3, t2
+; RV32I-NEXT:    or t0, t1, t0
+; RV32I-NEXT:    lbu t1, 25(a0)
+; RV32I-NEXT:    lbu t2, 24(a0)
+; RV32I-NEXT:    lbu t3, 26(a0)
+; RV32I-NEXT:    lbu t4, 27(a0)
+; RV32I-NEXT:    slli t1, t1, 8
+; RV32I-NEXT:    or t1, t1, t2
+; RV32I-NEXT:    slli t3, t3, 16
+; RV32I-NEXT:    slli t4, t4, 24
+; RV32I-NEXT:    or t2, t4, t3
+; RV32I-NEXT:    or t1, t2, t1
+; RV32I-NEXT:    lbu t2, 29(a0)
+; RV32I-NEXT:    lbu t3, 28(a0)
+; RV32I-NEXT:    lbu t4, 30(a0)
+; RV32I-NEXT:    lbu a0, 31(a0)
+; RV32I-NEXT:    slli t2, t2, 8
+; RV32I-NEXT:    or t2, t2, t3
+; RV32I-NEXT:    slli t4, t4, 16
+; RV32I-NEXT:    slli a0, a0, 24
+; RV32I-NEXT:    or a0, a0, t4
+; RV32I-NEXT:    or a0, a0, t2
+; RV32I-NEXT:    lbu a1, 0(a1)
+; RV32I-NEXT:    sw zero, 60(sp)
+; RV32I-NEXT:    sw zero, 56(sp)
+; RV32I-NEXT:    sw zero, 52(sp)
+; RV32I-NEXT:    sw zero, 48(sp)
+; RV32I-NEXT:    sw zero, 44(sp)
+; RV32I-NEXT:    sw zero, 40(sp)
+; RV32I-NEXT:    sw zero, 36(sp)
+; RV32I-NEXT:    sw zero, 32(sp)
+; RV32I-NEXT:    sw a0, 28(sp)
+; RV32I-NEXT:    sw t1, 24(sp)
+; RV32I-NEXT:    sw t0, 20(sp)
+; RV32I-NEXT:    sw a7, 16(sp)
+; RV32I-NEXT:    sw a6, 12(sp)
+; RV32I-NEXT:    sw a5, 8(sp)
+; RV32I-NEXT:    sw a4, 4(sp)
+; RV32I-NEXT:    sw a3, 0(sp)
+; RV32I-NEXT:    slli a1, a1, 3
+; RV32I-NEXT:    andi a1, a1, 24
+; RV32I-NEXT:    mv a0, sp
+; RV32I-NEXT:    add a3, a0, a1
+; RV32I-NEXT:    lw a0, 4(a3)
+; RV32I-NEXT:    lw a1, 0(a3)
+; RV32I-NEXT:    lw a4, 12(a3)
+; RV32I-NEXT:    lw a5, 8(a3)
+; RV32I-NEXT:    lw a6, 24(a3)
+; RV32I-NEXT:    lw a7, 28(a3)
+; RV32I-NEXT:    lw t0, 16(a3)
+; RV32I-NEXT:    lw a3, 20(a3)
+; RV32I-NEXT:    sb a6, 24(a2)
+; RV32I-NEXT:    sb a7, 28(a2)
+; RV32I-NEXT:    sb t0, 16(a2)
+; RV32I-NEXT:    sb a3, 20(a2)
+; RV32I-NEXT:    sb a5, 8(a2)
+; RV32I-NEXT:    sb a4, 12(a2)
+; RV32I-NEXT:    sb a1, 0(a2)
+; RV32I-NEXT:    sb a0, 4(a2)
+; RV32I-NEXT:    srli t1, a6, 24
+; RV32I-NEXT:    sb t1, 27(a2)
+; RV32I-NEXT:    srli t1, a6, 16
+; RV32I-NEXT:    sb t1, 26(a2)
+; RV32I-NEXT:    srli a6, a6, 8
+; RV32I-NEXT:    sb a6, 25(a2)
+; RV32I-NEXT:    srli a6, a7, 24
+; RV32I-NEXT:    sb a6, 31(a2)
+; RV32I-NEXT:    srli a6, a7, 16
+; RV32I-NEXT:    sb a6, 30(a2)
+; RV32I-NEXT:    srli a6, a7, 8
+; RV32I-NEXT:    sb a6, 29(a2)
+; RV32I-NEXT:    srli a6, t0, 24
+; RV32I-NEXT:    sb a6, 19(a2)
+; RV32I-NEXT:    srli a6, t0, 16
+; RV32I-NEXT:    sb a6, 18(a2)
+; RV32I-NEXT:    srli a6, t0, 8
+; RV32I-NEXT:    sb a6, 17(a2)
+; RV32I-NEXT:    srli a6, a3, 24
+; RV32I-NEXT:    sb a6, 23(a2)
+; RV32I-NEXT:    srli a6, a3, 16
+; RV32I-NEXT:    sb a6, 22(a2)
+; RV32I-NEXT:    srli a3, a3, 8
+; RV32I-NEXT:    sb a3, 21(a2)
+; RV32I-NEXT:    srli a3, a5, 24
+; RV32I-NEXT:    sb a3, 11(a2)
+; RV32I-NEXT:    srli a3, a5, 16
+; RV32I-NEXT:    sb a3, 10(a2)
+; RV32I-NEXT:    srli a5, a5, 8
+; RV32I-NEXT:    sb a5, 9(a2)
+; RV32I-NEXT:    srli a3, a4, 24
+; RV32I-NEXT:    sb a3, 15(a2)
+; RV32I-NEXT:    srli a3, a4, 16
+; RV32I-NEXT:    sb a3, 14(a2)
+; RV32I-NEXT:    srli a4, a4, 8
+; RV32I-NEXT:    sb a4, 13(a2)
+; RV32I-NEXT:    srli a3, a1, 24
+; RV32I-NEXT:    sb a3, 3(a2)
+; RV32I-NEXT:    srli a3, a1, 16
+; RV32I-NEXT:    sb a3, 2(a2)
+; RV32I-NEXT:    srli a1, a1, 8
+; RV32I-NEXT:    sb a1, 1(a2)
+; RV32I-NEXT:    srli a1, a0, 24
+; RV32I-NEXT:    sb a1, 7(a2)
+; RV32I-NEXT:    srli a1, a0, 16
+; RV32I-NEXT:    sb a1, 6(a2)
+; RV32I-NEXT:    srli a0, a0, 8
+; RV32I-NEXT:    sb a0, 5(a2)
+; RV32I-NEXT:    addi sp, sp, 64
+; RV32I-NEXT:    ret
+  %src = load i256, ptr %src.ptr, align 1
+  %dwordOff = load i256, ptr %dwordOff.ptr, align 1
+  %bitOff = shl i256 %dwordOff, 6
+  %res = lshr i256 %src, %bitOff
+  store i256 %res, ptr %dst, align 1
+  ret void
+}
+
+define void @shl_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
+; RV64I-LABEL: shl_32bytes:
+; RV64I:       # %bb.0:
+; RV64I-NEXT:    addi sp, sp, -64
+; RV64I-NEXT:    lbu a3, 1(a0)
+; RV64I-NEXT:    lbu a4, 0(a0)
+; RV64I-NEXT:    lbu a5, 2(a0)
+; RV64I-NEXT:    lbu a6, 3(a0)
+; RV64I-NEXT:    slli a3, a3, 8
+; RV64I-NEXT:    or a3, a3, a4
+; RV64I-NEXT:    slli a5, a5, 16
+; RV64I-NEXT:    slli a6, a6, 24
+; RV64I-NEXT:    or a4, a6, a5
+; RV64I-NEXT:    or a3, a4, a3
+; RV64I-NEXT:    lbu a4, 5(a0)
+; RV64I-NEXT:    lbu a5, 4(a0)
+; RV64I-NEXT:    lbu a6, 6(a0)
+; RV64I-NEXT:    lbu a7, 7(a0)
+; RV64I-NEXT:    slli a4, a4, 8
+; RV64I-NEXT:    or a4, a4, a5
+; RV64I-NEXT:    slli a6, a6, 16
+; RV64I-NEXT:    slli a7, a7, 24
+; RV64I-NEXT:    or a5, a7, a6
+; RV64I-NEXT:    or a4, a5, a4
+; RV64I-NEXT:    slli a4, a4, 32
+; RV64I-NEXT:    or a3, a4, a3
+; RV64I-NEXT:    lbu a4, 9(a0)
+; RV64I-NEXT:    lbu a5, 8(a0)
+; RV64I-NEXT:    lbu a6, 10(a0)
+; RV64I-NEXT:    lbu a7, 11(a0)
+; RV64I-NEXT:    slli a4, a4, 8
+; RV64I-NEXT:    or a4, a4, a5
+; RV64I-NEXT:    slli a6, a6, 16
+; RV64I-NEXT:    slli a7, a7, 24
+; RV64I-NEXT:    or a5, a7, a6
+; RV64I-NEXT:    or a4, a5, a4
+; RV64I-NEXT:    lbu a5, 13(a0)
+; RV64I-NEXT:    lbu a6, 12(a0)
+; RV64I-NEXT:    lbu a7, 14(a0)
+; RV64I-NEXT:    lbu t0, 15(a0)
+; RV64I-NEXT:    slli a5, a5, 8
+; RV64I-NEXT:    or a5, a5, a6
+; RV64I-NEXT:    slli a7, a7, 16
+; RV64I-NEXT:    slli t0, t0, 24
+; RV64I-NEXT:    or a6, t0, a7
+; RV64I-NEXT:    or a5, a6, a5
+; RV64I-NEXT:    slli a5, a5, 32
+; RV64I-NEXT:    or a4, a5, a4
+; RV64I-NEXT:    lbu a5, 17(a0)
+; RV64I-NEXT:    lbu a6, 16(a0)
+; RV64I-NEXT:    lbu a7, 18(a0)
+; RV64I-NEXT:    lbu t0, 19(a0)
+; RV64I-NEXT:    slli a5, a5, 8
+; RV64I-NEXT:    or a5, a5, a6
+; RV64I-NEXT:    slli a7, a7, 16
+; RV64I-NEXT:    slli t0, t0, 24
+; RV64I-NEXT:    or a6, t0, a7
+; RV64I-NEXT:    or a5, a6, a5
+; RV64I-NEXT:    lbu a6, 21(a0)
+; RV64I-NEXT:    lbu a7, 20(a0)
+; RV64I-NEXT:    lbu t0, 22(a0)
+; RV64I-NEXT:    lbu t1, 23(a0)
+; RV64I-NEXT:    slli a6, a6, 8
+; RV64I-NEXT:    or a6, a6, a7
+; RV64I-NEXT:    slli t0, t0, 16
+; RV64I-NEXT:    slli t1, t1, 24
+; RV64I-NEXT:    or a7, t1, t0
+; RV64I-NEXT:    or a6, a7, a6
+; RV64I-NEXT:    slli a6, a6, 32
+; RV64I-NEXT:    or a5, a6, a5
+; RV64I-NEXT:    lbu a6, 25(a0)
+; RV64I-NEXT:    lbu a7, 24(a0)
+; RV64I-NEXT:    lbu t0, 26(a0)
+; RV64I-NEXT:    lbu t1, 27(a0)
+; RV64I-NEXT:    slli a6, a6, 8
+; RV64I-NEXT:    or a6, a6, a7
+; RV64I-NEXT:    slli t0, t0, 16
+; RV64I-NEXT:    slli t1, t1, 24
+; RV64I-NEXT:    or a7, t1, t0
+; RV64I-NEXT:    or a6, a7, a6
+; RV64I-NEXT:    lbu a7, 29(a0)
+; RV64I-NEXT:    lbu t0, 28(a0)
+; RV64I-NEXT:    lbu t1, 30(a0)
+; RV64I-NEXT:    lbu a0, 31(a0)
+; RV64I-NEXT:    slli a7, a7, 8
+; RV64I-NEXT:    or a7, a7, t0
+; RV64I-NEXT:    slli t1, t1, 16
+; RV64I-NEXT:    slli a0, a0, 24
+; RV64I-NEXT:    or a0, a0, t1
+; RV64I-NEXT:    or a0, a0, a7
+; RV64I-NEXT:    slli a0, a0, 32
+; RV64I-NEXT:    or a0, a0, a6
+; RV64I-NEXT:    lbu a6, 1(a1)
+; RV64I-NEXT:    lbu a7, 0(a1)
+; RV64I-NEXT:    lbu t0, 2(a1)
+; RV64I-NEXT:    lbu t1, 3(a1)
+; RV64I-NEXT:    slli a6, a6, 8
+; RV64I-NEXT:    or a6, a6, a7
+; RV64I-NEXT:    slli t0, t0, 16
+; RV64I-NEXT:    slli t1, t1, 24
+; RV64I-NEXT:    or a7, t1, t0
+; RV64I-NEXT:    or a6, a7, a6
+; RV64I-NEXT:    lbu a7, 5(a1)
+; RV64I-NEXT:    lbu t0, 4(a1)
+; RV64I-NEXT:    lbu t1, 6(a1)
+; RV64I-NEXT:    lbu a1, 7(a1)
+; RV64I-NEXT:    slli a7, a7, 8
+; RV64I-NEXT:    or a7, a7, t0
+; RV64I-NEXT:    slli t1, t1, 16
+; RV64I-NEXT:    slli a1, a1, 24
+; RV64I-NEXT:    or a1, a1, t1
+; RV64I-NEXT:    or a1, a1, a7
+; RV64I-NEXT:    slli a1, a1, 32
+; RV64I-NEXT:    or a1, a1, a6
+; RV64I-NEXT:    sd zero, 24(sp)
+; RV64I-NEXT:    sd zero, 16(sp)
+; RV64I-NEXT:    sd zero, 8(sp)
+; RV64I-NEXT:    sd zero, 0(sp)
+; RV64I-NEXT:    sd a0, 56(sp)
+; RV64I-NEXT:    sd a5, 48(sp)
+; RV64I-NEXT:    sd a4, 40(sp)
+; RV64I-NEXT:    sd a3, 32(sp)
+; RV64I-NEXT:    andi a0, a1, 24
+; RV64I-NEXT:    addi a3, sp, 32
+; RV64I-NEXT:    sub a3, a3, a0
+; RV64I-NEXT:    ld a4, 8(a3)
+; RV64I-NEXT:    slli a1, a1, 3
+; RV64I-NEXT:    ld a5, 0(a3)
+; RV64I-NEXT:    sll a6, a4, a1
+; RV64I-NEXT:    andi a0, a1, 56
+; RV64I-NEXT:    xori a7, a0, 63
+; RV64I-NEXT:    srli a0, a5, 1
+; RV64I-NEXT:    ld t0, 24(a3)
+; RV64I-NEXT:    ld a3, 16(a3)
+; RV64I-NEXT:    srl a0, a0, a7
+; RV64I-NEXT:    or a0, a6, a0
+; RV64I-NEXT:    sll t0, t0, a1
+; RV64I-NEXT:    srli t1, a3, 1
+; RV64I-NEXT:    srl t1, t1, a7
+; RV64I-NEXT:    or t1, t0, t1
+; RV64I-NEXT:    sll a3, a3, a1
+; RV64I-NEXT:    srli a4, a4, 1
+; RV64I-NEXT:    srl a4, a4, a7
+; RV64I-NEXT:    or a4, a3, a4
+; RV64I-NEXT:    sll a1, a5, a1
+; RV64I-NEXT:    sb a1, 0(a2)
+; RV64I-NEXT:    srli a3, a3, 56
+; RV64I-NEXT:    sb a3, 23(a2)
+; RV64I-NEXT:    srli a3, t0, 56
+; RV64I-NEXT:    sb a3, 31(a2)
+; RV64I-NEXT:    srli a3, a1, 56
+; RV64I-NEXT:    sb a3, 7(a2)
+; RV64I-NEXT:    srli a3, a1, 48
+; RV64I-NEXT:    sb a3, 6(a2)
+; RV64I-NEXT:    srli a3, a1, 40
+; RV64I-NEXT:    sb a3, 5(a2)
+; RV64I-NEXT:    srli a3, a1, 32
+; RV64I-NEXT:    sb a3, 4(a2)
+; RV64I-NEXT:    srli a3, a1, 24
+; RV64I-NEXT:    sb a3, 3(a2)
+; RV64I-NEXT:    srli a3, a1, 16
+; RV64I-NEXT:    sb a3, 2(a2)
+; RV64I-NEXT:    srli a1, a1, 8
+; RV64I-NEXT:    sb a1, 1(a2)
+; RV64I-NEXT:    srli a1, a6, 56
+; RV64I-NEXT:    sb a1, 15(a2)
+; RV64I-NEXT:    sb a4, 16(a2)
+; RV64I-NEXT:    sb t1, 24(a2)
+; RV64I-NEXT:    sb a0, 8(a2)
+; RV64I-NEXT:    srli a1, a4, 48
+; RV64I-NEXT:    sb a1, 22(a2)
+; RV64I-NEXT:    srli a1, a4, 40
+; RV64I-NEXT:    sb a1, 21(a2)
+; RV64I-NEXT:    srli a1, a4, 32
+; RV64I-NEXT:    sb a1, 20(a2)
+; RV64I-NEXT:    srli a1, a4, 24
+; RV64I-NEXT:    sb a1, 19(a2)
+; RV64I-NEXT:    srli a1, a4, 16
+; RV64I-NEXT:    sb a1, 18(a2)
+; RV64I-NEXT:    srli a4, a4, 8
+; RV64I-NEXT:    sb a4, 17(a2)
+; RV64I-NEXT:    srli a1, t1, 48
+; RV64I-NEXT:    sb a1, 30(a2)
+; RV64I-NEXT:    srli a1, t1, 40
+; RV64I-NEXT:    sb a1, 29(a2)
+; RV64I-NEXT:    srli a1, t1, 32
+; RV64I-NEXT:    sb a1, 28(a2)
+; RV64I-NEXT:    srli a1, t1, 24
+; RV64I-NEXT:    sb a1, 27(a2)
+; RV64I-NEXT:    srli a1, t1, 16
+; RV64I-NEXT:    sb a1, 26(a2)
+; RV64I-NEXT:    srli a1, t1, 8
+; RV64I-NEXT:    sb a1, 25(a2)
+; RV64I-NEXT:    srli a1, a0, 48
+; RV64I-NEXT:    sb a1, 14(a2)
+; RV64I-NEXT:    srli a1, a0, 40
+; RV64I-NEXT:    sb a1, 13(a2)
+; RV64I-NEXT:    srli a1, a0, 32
+; RV64I-NEXT:    sb a1, 12(a2)
+; RV64I-NEXT:    srli a1, a0, 24
+; RV64I-NEXT:    sb a1, 11(a2)
+; RV64I-NEXT:    srli a1, a0, 16
+; RV64I-NEXT:    sb a1, 10(a2)
+; RV64I-NEXT:    srli a0, a0, 8
+; RV64I-NEXT:    sb a0, 9(a2)
+; RV64I-NEXT:    addi sp, sp, 64
+; RV64I-NEXT:    ret
+;
+; RV32I-LABEL: shl_32bytes:
+; RV32I:       # %bb.0:
+; RV32I-NEXT:    addi sp, sp, -80
+; RV32I-NEXT:    sw s0, 76(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s1, 72(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s2, 68(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    lbu a3, 1(a0)
+; RV32I-NEXT:    lbu a4, 0(a0)
+; RV32I-NEXT:    lbu a5, 2(a0)
+; RV32I-NEXT:    lbu a6, 3(a0)
+; RV32I-NEXT:    slli a3, a3, 8
+; RV32I-NEXT:    or a3, a3, a4
+; RV32I-NEXT:    slli a5, a5, 16
+; RV32I-NEXT:    slli a6, a6, 24
+; RV32I-NEXT:    or a4, a6, a5
+; RV32I-NEXT:    or a3, a4, a3
+; RV32I-NEXT:    lbu a4, 5(a0)
+; RV32I-NEXT:    lbu a5, 4(a0)
+; RV32I-NEXT:    lbu a6, 6(a0)
+; RV32I-NEXT:    lbu a7, 7(a0)
+; RV32I-NEXT:    slli a4, a4, 8
+; RV32I-NEXT:    or a4, a4, a5
+; RV32I-NEXT:    slli a6, a6, 16
+; RV32I-NEXT:    slli a7, a7, 24
+; RV32I-NEXT:    or a5, a7, a6
+; RV32I-NEXT:    or a4, a5, a4
+; RV32I-NEXT:    lbu a5, 9(a0)
+; RV32I-NEXT:    lbu a6, 8(a0)
+; RV32I-NEXT:    lbu a7, 10(a0)
+; RV32I-NEXT:    lbu t0, 11(a0)
+; RV32I-NEXT:    slli a5, a5, 8
+; RV32I-NEXT:    or a5, a5, a6
+; RV32I-NEXT:    slli a7, a7, 16
+; RV32I-NEXT:    slli t0, t0, 24
+; RV32I-NEXT:    or a6, t0, a7
+; RV32I-NEXT:    or a5, a6, a5
+; RV32I-NEXT:    lbu a6, 13(a0)
+; RV32I-NEXT:    lbu a7, 12(a0)
+; RV32I-NEXT:    lbu t0, 14(a0)
+; RV32I-NEXT:    lbu t1, 15(a0)
+; RV32I-NEXT:    slli a6, a6, 8
+; RV32I-NEXT:    or a6, a6, a7
+; RV32I-NEXT:    slli t0, t0, 16
+; RV32I-NEXT:    slli t1, t1, 24
+; RV32I-NEXT:    or a7, t1, t0
+; RV32I-NEXT:    or a6, a7, a6
+; RV32I-NEXT:    lbu a7, 17(a0)
+; RV32I-NEXT:    lbu t0, 16(a0)
+; RV32I-NEXT:    lbu t1, 18(a0)
+; RV32I-NEXT:    lbu t2, 19(a0)
+; RV32I-NEXT:    slli a7, a7, 8
+; RV32I-NEXT:    or a7, a7, t0
+; RV32I-NEXT:    slli t1, t1, 16
+; RV32I-NEXT:    slli t2, t2, 24
+; RV32I-NEXT:    or t0, t2, t1
+; RV32I-NEXT:    or a7, t0, a7
+; RV32I-NEXT:    lbu t0, 21(a0)
+; RV32I-NEXT:    lbu t1, 20(a0)
+; RV32I-NEXT:    lbu t2, 22(a0)
+; RV32I-NEXT:    lbu t3, 23(a0)
+; RV32I-NEXT:    slli t0, t0, 8
+; RV32I-NEXT:    or t0, t0, t1
+; RV32I-NEXT:    slli t2, t2, 16
+; RV32I-NEXT:    slli t3, t3, 24
+; RV32I-NEXT:    or t1, t3, t2
+; RV32I-NEXT:    or t0, t1, t0
+; RV32I-NEXT:    lbu t1, 25(a0)
+; RV32I-NEXT:    lbu t2, 24(a0)
+; RV32I-NEXT:    lbu t3, 26(a0)
+; RV32I-NEXT:    lbu t4, 27(a0)
+; RV32I-NEXT:    slli t1, t1, 8
+; RV32I-NEXT:    or t1, t1, t2
+; RV32I-NEXT:    slli t3, t3, 16
+; RV32I-NEXT:    slli t4, t4, 24
+; RV32I-NEXT:    or t2, t4, t3
+; RV32I-NEXT:    or t1, t2, t1
+; RV32I-NEXT:    lbu t2, 29(a0)
+; RV32I-NEXT:    lbu t3, 28(a0)
+; RV32I-NEXT:    lbu t4, 30(a0)
+; RV32I-NEXT:    lbu a0, 31(a0)
+; RV32I-NEXT:    slli t2, t2, 8
+; RV32I-NEXT:    or t2, t2, t3
+; RV32I-NEXT:    slli t4, t4, 16
+; RV32I-NEXT:    slli a0, a0, 24
+; RV32I-NEXT:    or a0, a0, t4
+; RV32I-NEXT:    or a0, a0, t2
+; RV32I-NEXT:    lbu t2, 1(a1)
+; RV32I-NEXT:    lbu t3, 0(a1)
+; RV32I-NEXT:    lbu t4, 2(a1)
+; RV32I-NEXT:    lbu a1, 3(a1)
+; RV32I-NEXT:    slli t2, t2, 8
+; RV32I-NEXT:    or t2, t2, t3
+; RV32I-NEXT:    slli t4, t4, 16
+; RV32I-NEXT:    slli a1, a1, 24
+; RV32I-NEXT:    or a1, a1, t4
+; RV32I-NEXT:    or a1, a1, t2
+; RV32I-NEXT:    sw zero, 28(sp)
+; RV32I-NEXT:    sw zero, 24(sp)
+; RV32I-NEXT:    sw zero, 20(sp)
+; RV32I-NEXT:    sw zero, 16(sp)
+; RV32I-NEXT:    sw zero, 12(sp)
+; RV32I-NEXT:    sw zero, 8(sp)
+; RV32I-NEXT:    sw zero, 4(sp)
+; RV32I-NEXT:    sw zero, 0(sp)
+; RV32I-NEXT:    sw a0, 60(sp)
+; RV32I-NEXT:    sw t1, 56(sp)
+; RV32I-NEXT:    sw t0, 52(sp)
+; RV32I-NEXT:    sw a7, 48(sp)
+; RV32I-NEXT:    sw a6, 44(sp)
+; RV32I-NEXT:    sw a5, 40(sp)
+; RV32I-NEXT:    sw a4, 36(sp)
+; RV32I-NEXT:    sw a3, 32(sp)
+; RV32I-NEXT:    andi a0, a1, 28
+; RV32I-NEXT:    addi a3, sp, 32
+; RV32I-NEXT:    sub a6, a3, a0
+; RV32I-NEXT:    lw a3, 4(a6)
+; RV32I-NEXT:    slli a7, a1, 3
+; RV32I-NEXT:    lw t0, 0(a6)
+; RV32I-NEXT:    sll a4, a3, a7
+; RV32I-NEXT:    andi a0, a7, 24
+; RV32I-NEXT:    xori t1, a0, 31
+; RV32I-NEXT:    srli a0, t0, 1
+; RV32I-NEXT:    lw t2, 12(a6)
+; RV32I-NEXT:    lw a5, 8(a6)
+; RV32I-NEXT:    srl a0, a0, t1
+; RV32I-NEXT:    or a0, a4, a0
+; RV32I-NEXT:    sll t3, t2, a7
+; RV32I-NEXT:    srli a1, a5, 1
+; RV32I-NEXT:    srl a1, a1, t1
+; RV32I-NEXT:    or a1, t3, a1
+; RV32I-NEXT:    sll t4, a5, a7
+; RV32I-NEXT:    srli a3, a3, 1
+; RV32I-NEXT:    lw t5, 20(a6)
+; RV32I-NEXT:    lw t6, 16(a6)
+; RV32I-NEXT:    srl a3, a3, t1
+; RV32I-NEXT:    or a3, t4, a3
+; RV32I-NEXT:    sll s0, t5, a7
+; RV32I-NEXT:    srli a5, t6, 1
+; RV32I-NEXT:    srl a5, a5, t1
+; RV32I-NEXT:    or a5, s0, a5
+; RV32I-NEXT:    sll t6, t6, a7
+; RV32I-NEXT:    srli t2, t2, 1
+; RV32I-NEXT:    lw s1, 28(a6)
+; RV32I-NEXT:    lw a6, 24(a6)
+; RV32I-NEXT:    srl t2, t2, t1
+; RV32I-NEXT:    or t2, t6, t2
+; RV32I-NEXT:    sll s1, s1, a7
+; RV32I-NEXT:    srli s2, a6, 1
+; RV32I-NEXT:    srl s2, s2, t1
+; RV32I-NEXT:    or s2, s1, s2
+; RV32I-NEXT:    sll a6, a6, a7
+; RV32I-NEXT:    srli t5, t5, 1
+; RV32I-NEXT:    srl t1, t5, t1
+; RV32I-NEXT:    or t1, a6, t1
+; RV32I-NEXT:    sll a7, t0, a7
+; RV32I-NEXT:    sb a7, 0(a2)
+; RV32I-NEXT:    srli a6, a6, 24
+; RV32I-NEXT:    sb a6, 27(a2)
+; RV32I-NEXT:    srli s1, s1, 24
+; RV32I-NEXT:    sb s1, 31(a2)
+; RV32I-NEXT:    srli a6, t6, 24
+; RV32I-NEXT:    sb a6, 19(a2)
+; RV32I-NEXT:    srli s0, s0, 24
+; RV32I-NEXT:    sb s0, 23(a2)
+; RV32I-NEXT:    srli a6, t4, 24
+; RV32I-NEXT:    sb a6, 11(a2)
+; RV32I-NEXT:    srli a6, t3, 24
+; RV32I-NEXT:    sb a6, 15(a2)
+; RV32I-NEXT:    srli a6, a7, 24
+; RV32I-NEXT:    sb a6, 3(a2)
+; RV32I-NEXT:    srli a6, a7, 16
+; RV32I-NEXT:    sb a6, 2(a2)
+; RV32I-NEXT:    srli a6, a7, 8
+; RV32I-NEXT:    sb a6, 1(a2)
+; RV32I-NEXT:    srli a4, a4, 24
+; RV32I-NEXT:    sb a4, 7(a2)
+; RV32I-NEXT:    sb t1, 24(a2)
+; RV32I-NEXT:    sb s2, 28(a2)
+; RV32I-NEXT:    sb t2, 16(a2)
+; RV32I-NEXT:    sb a5, 20(a2)
+; RV32I-NEXT:    sb a3, 8(a2)
+; RV32I-NEXT:    sb a1, 12(a2)
+; RV32I-NEXT:    sb a0, 4(a2)
+; RV32I-NEXT:    srli a4, t1, 16
+; RV32I-NEXT:    sb a4, 26(a2)
+; RV32I-NEXT:    srli a4, t1, 8
+; RV32I-NEXT:    sb a4, 25(a2)
+; RV32I-NEXT:    srli a4, s2, 16
+; RV32I-NEXT:    sb a4, 30(a2)
+; RV32I-NEXT:    srli a4, s2, 8
+; RV32I-NEXT:    sb a4, 29(a2)
+; RV32I-NEXT:    srli a4, t2, 16
+; RV32I-NEXT:    sb a4, 18(a2)
+; RV32I-NEXT:    srli a4, t2, 8
+; RV32I-NEXT:    sb a4, 17(a2)
+; RV32I-NEXT:    srli a4, a5, 16
+; RV32I-NEXT:    sb a4, 22(a2)
+; RV32I-NEXT:    srli a5, a5, 8
+; RV32I-NEXT:    sb a5, 21(a2)
+; RV32I-NEXT:    srli a4, a3, 16
+; RV32I-NEXT:    sb a4, 10(a2)
+; RV32I-NEXT:    srli a3, a3, 8
+; RV32I-NEXT:    sb a3, 9(a2)
+; RV32I-NEXT:    srli a3, a1, 16
+; RV32I-NEXT:    sb a3, 14(a2)
+; RV32I-NEXT:    srli a1, a1, 8
+; RV32I-NEXT:    sb a1, 13(a2)
+; RV32I-NEXT:    srli a1, a0, 16
+; RV32I-NEXT:    sb a1, 6(a2)
+; RV32I-NEXT:    srli a0, a0, 8
+; RV32I-NEXT:    sb a0, 5(a2)
+; RV32I-NEXT:    lw s0, 76(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s1, 72(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s2, 68(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    addi sp, sp, 80
+; RV32I-NEXT:    ret
+  %src = load i256, ptr %src.ptr, align 1
+  %byteOff = load i256, ptr %byteOff.ptr, align 1
+  %bitOff = shl i256 %byteOff, 3
+  %res = shl i256 %src, %bitOff
+  store i256 %res, ptr %dst, align 1
+  ret void
+}
+
+define void @shl_32bytes_wordOff(ptr %src.ptr, ptr %wordOff.ptr, ptr %dst) nounwind {
+; RV64I-LABEL: shl_32bytes_wordOff:
+; RV64I:       # %bb.0:
+; RV64I-NEXT:    addi sp, sp, -64
+; RV64I-NEXT:    lbu a3, 1(a0)
+; RV64I-NEXT:    lbu a4, 0(a0)
+; RV64I-NEXT:    lbu a5, 2(a0)
+; RV64I-NEXT:    lbu a6, 3(a0)
+; RV64I-NEXT:    slli a3, a3, 8
+; RV64I-NEXT:    or a3, a3, a4
+; RV64I-NEXT:    slli a5, a5, 16
+; RV64I-NEXT:    slli a6, a6, 24
+; RV64I-NEXT:    or a4, a6, a5
+; RV64I-NEXT:    or a3, a4, a3
+; RV64I-NEXT:    lbu a4, 5(a0)
+; RV64I-NEXT:    lbu a5, 4(a0)
+; RV64I-NEXT:    lbu a6, 6(a0)
+; RV64I-NEXT:    lbu a7, 7(a0)
+; RV64I-NEXT:    slli a4, a4, 8
+; RV64I-NEXT:    or a4, a4, a5
+; RV64I-NEXT:    slli a6, a6, 16
+; RV64I-NEXT:    slli a7, a7, 24
+; RV64I-NEXT:    or a5, a7, a6
+; RV64I-NEXT:    or a4, a5, a4
+; RV64I-NEXT:    slli a4, a4, 32
+; RV64I-NEXT:    or a3, a4, a3
+; RV64I-NEXT:    lbu a4, 9(a0)
+; RV64I-NEXT:    lbu a5, 8(a0)
+; RV64I-NEXT:    lbu a6, 10(a0)
+; RV64I-NEXT:    lbu a7, 11(a0)
+; RV64I-NEXT:    slli a4, a4, 8
+; RV64I-NEXT:    or a4, a4, a5
+; RV64I-NEXT:    slli a6, a6, 16
+; RV64I-NEXT:    slli a7, a7, 24
+; RV64I-NEXT:    or a5, a7, a6
+; RV64I-NEXT:    or a4, a5, a4
+; RV64I-NEXT:    lbu a5, 13(a0)
+; RV64I-NEXT:    lbu a6, 12(a0)
+; RV64I-NEXT:    lbu a7, 14(a0)
+; RV64I-NEXT:    lbu t0, 15(a0)
+; RV64I-NEXT:    slli a5, a5, 8
+; RV64I-NEXT:    or a5, a5, a6
+; RV64I-NEXT:    slli a7, a7, 16
+; RV64I-NEXT:    slli t0, t0, 24
+; RV64I-NEXT:    or a6, t0, a7
+; RV64I-NEXT:    or a5, a6, a5
+; RV64I-NEXT:    slli a5, a5, 32
+; RV64I-NEXT:    or a4, a5, a4
+; RV64I-NEXT:    lbu a5, 17(a0)
+; RV64I-NEXT:    lbu a6, 16(a0)
+; RV64I-NEXT:    lbu a7, 18(a0)
+; RV64I-NEXT:    lbu t0, 19(a0)
+; RV64I-NEXT:    slli a5, a5, 8
+; RV64I-NEXT:    or a5, a5, a6
+; RV64I-NEXT:    slli a7, a7, 16
+; RV64I-NEXT:    slli t0, t0, 24
+; RV64I-NEXT:    or a6, t0, a7
+; RV64I-NEXT:    or a5, a6, a5
+; RV64I-NEXT:    lbu a6, 21(a0)
+; RV64I-NEXT:    lbu a7, 20(a0)
+; RV64I-NEXT:    lbu t0, 22(a0)
+; RV64I-NEXT:    lbu t1, 23(a0)
+; RV64I-NEXT:    slli a6, a6, 8
+; RV64I-NEXT:    or a6, a6, a7
+; RV64I-NEXT:    slli t0, t0, 16
+; RV64I-NEXT:    slli t1, t1, 24
+; RV64I-NEXT:    or a7, t1, t0
+; RV64I-NEXT:    or a6, a7, a6
+; RV64I-NEXT:    slli a6, a6, 32
+; RV64I-NEXT:    or a5, a6, a5
+; RV64I-NEXT:    lbu a6, 25(a0)
+; RV64I-NEXT:    lbu a7, 24(a0)
+; RV64I-NEXT:    lbu t0, 26(a0)
+; RV64I-NEXT:    lbu t1, 27(a0)
+; RV64I-NEXT:    slli a6, a6, 8
+; RV64I-NEXT:    or a6, a6, a7
+; RV64I-NEXT:    slli t0, t0, 16
+; RV64I-NEXT:    slli t1, t1, 24
+; RV64I-NEXT:    or a7, t1, t0
+; RV64I-NEXT:    or a6, a7, a6
+; RV64I-NEXT:    lbu a7, 29(a0)
+; RV64I-NEXT:    lbu t0, 28(a0)
+; RV64I-NEXT:    lbu t1, 30(a0)
+; RV64I-NEXT:    lbu a0, 31(a0)
+; RV64I-NEXT:    slli a7, a7, 8
+; RV64I-NEXT:    or a7, a7, t0
+; RV64I-NEXT:    slli t1, t1, 16
+; RV64I-NEXT:    slli a0, a0, 24
+; RV64I-NEXT:    or a0, a0, t1
+; RV64I-NEXT:    or a0, a0, a7
+; RV64I-NEXT:    slli a0, a0, 32
+; RV64I-NEXT:    or a0, a0, a6
+; RV64I-NEXT:    lbu a6, 1(a1)
+; RV64I-NEXT:    lbu a7, 0(a1)
+; RV64I-NEXT:    lbu t0, 2(a1)
+; RV64I-NEXT:    lbu t1, 3(a1)
+; RV64I-NEXT:    slli a6, a6, 8
+; RV64I-NEXT:    or a6, a6, a7
+; RV64I-NEXT:    slli t0, t0, 16
+; RV64I-NEXT:    slli t1, t1, 24
+; RV64I-NEXT:    or a7, t1, t0
+; RV64I-NEXT:    or a6, a7, a6
+; RV64I-NEXT:    lbu a7, 5(a1)
+; RV64I-NEXT:    lbu t0, 4(a1)
+; RV64I-NEXT:    lbu t1, 6(a1)
+; RV64I-NEXT:    lbu a1, 7(a1)
+; RV64I-NEXT:    slli a7, a7, 8
+; RV64I-NEXT:    or a7, a7, t0
+; RV64I-NEXT:    slli t1, t1, 16
+; RV64I-NEXT:    slli a1, a1, 24
+; RV64I-NEXT:    or a1, a1, t1
+; RV64I-NEXT:    or a1, a1, a7
+; RV64I-NEXT:    slli a1, a1, 32
+; RV64I-NEXT:    or a1, a1, a6
+; RV64I-NEXT:    sd zero, 24(sp)
+; RV64I-NEXT:    sd zero, 16(sp)
+; RV64I-NEXT:    sd zero, 8(sp)
+; RV64I-NEXT:    sd zero, 0(sp)
+; RV64I-NEXT:    sd a0, 56(sp)
+; RV64I-NEXT:    sd a5, 48(sp)
+; RV64I-NEXT:    sd a4, 40(sp)
+; RV64I-NEXT:    sd a3, 32(sp)
+; RV64I-NEXT:    slli a0, a1, 2
+; RV64I-NEXT:    andi a0, a0, 24
+; RV64I-NEXT:    addi a3, sp, 32
+; RV64I-NEXT:    sub a0, a3, a0
+; RV64I-NEXT:    ld a4, 8(a0)
+; RV64I-NEXT:    slli a5, a1, 5
+; RV64I-NEXT:    ld a6, 0(a0)
+; RV64I-NEXT:    sll a3, a4, a5
+; RV64I-NEXT:    andi a1, a5, 32
+; RV64I-NEXT:    xori a7, a1, 63
+; RV64I-NEXT:    srli a1, a6, 1
+; RV64I-NEXT:    ld t0, 24(a0)
+; RV64I-NEXT:    ld t1, 16(a0)
+; RV64I-NEXT:    srl a0, a1, a7
+; RV64I-NEXT:    or a0, a3, a0
+; RV64I-NEXT:    sll t0, t0, a5
+; RV64I-NEXT:    srli a1, t1, 1
+; RV64I-NEXT:    srl a1, a1, a7
+; RV64I-NEXT:    or a1, t0, a1
+; RV64I-NEXT:    sll t1, t1, a5
+; RV64I-NEXT:    srli a4, a4, 1
+; RV64I-NEXT:    srl a4, a4, a7
+; RV64I-NEXT:    or a4, t1, a4
+; RV64I-NEXT:    sll a5, a6, a5
+; RV64I-NEXT:    sb a5, 0(a2)
+; RV64I-NEXT:    srli a6, t1, 56
+; RV64I-NEXT:    sb a6, 23(a2)
+; RV64I-NEXT:    srli a6, t1, 48
+; RV64I-NEXT:    sb a6, 22(a2)
+; RV64I-NEXT:    srli a6, t1, 40
+; RV64I-NEXT:    sb a6, 21(a2)
+; RV64I-NEXT:    srli a6, t1, 32
+; RV64I-NEXT:    sb a6, 20(a2)
+; RV64I-NEXT:    srli a6, t0, 56
+; RV64I-NEXT:    sb a6, 31(a2)
+; RV64I-NEXT:    srli a6, t0, 48
+; RV64I-NEXT:    sb a6, 30(a2)
+; RV64I-NEXT:    srli a6, t0, 40
+; RV64I-NEXT:    sb a6, 29(a2)
+; RV64I-NEXT:    srli a6, t0, 32
+; RV64I-NEXT:    sb a6, 28(a2)
+; RV64I-NEXT:    srli a6, a5, 56
+; RV64I-NEXT:    sb a6, 7(a2)
+; RV64I-NEXT:    srli a6, a5, 48
+; RV64I-NEXT:    sb a6, 6(a2)
+; RV64I-NEXT:    srli a6, a5, 40
+; RV64I-NEXT:    sb a6, 5(a2)
+; RV64I-NEXT:    srli a6, a5, 32
+; RV64I-NEXT:    sb a6, 4(a2)
+; RV64I-NEXT:    srli a6, a5, 24
+; RV64I-NEXT:    sb a6, 3(a2)
+; RV64I-NEXT:    srli a6, a5, 16
+; RV64I-NEXT:    sb a6, 2(a2)
+; RV64I-NEXT:    srli a5, a5, 8
+; RV64I-NEXT:    sb a5, 1(a2)
+; RV64I-NEXT:    srli a5, a3, 56
+; RV64I-NEXT:    sb a5, 15(a2)
+; RV64I-NEXT:    srli a5, a3, 48
+; RV64I-NEXT:    sb a5, 14(a2)
+; RV64I-NEXT:    srli a5, a3, 40
+; RV64I-NEXT:    sb a5, 13(a2)
+; RV64I-NEXT:    srli a3, a3, 32
+; RV64I-NEXT:    sb a3, 12(a2)
+; RV64I-NEXT:    sb a4, 16(a2)
+; RV64I-NEXT:    sb a1, 24(a2)
+; RV64I-NEXT:    sb a0, 8(a2)
+; RV64I-NEXT:    srli a3, a4, 24
+; RV64I-NEXT:    sb a3, 19(a2)
+; RV64I-NEXT:    srli a3, a4, 16
+; RV64I-NEXT:    sb a3, 18(a2)
+; RV64I-NEXT:    srli a4, a4, 8
+; RV64I-NEXT:    sb a4, 17(a2)
+; RV64I-NEXT:    srli a3, a1, 24
+; RV64I-NEXT:    sb a3, 27(a2)
+; RV64I-NEXT:    srli a3, a1, 16
+; RV64I-NEXT:    sb a3, 26(a2)
+; RV64I-NEXT:    srli a1, a1, 8
+; RV64I-NEXT:    sb a1, 25(a2)
+; RV64I-NEXT:    srli a1, a0, 24
+; RV64I-NEXT:    sb a1, 11(a2)
+; RV64I-NEXT:    srli a1, a0, 16
+; RV64I-NEXT:    sb a1, 10(a2)
+; RV64I-NEXT:    srli a0, a0, 8
+; RV64I-NEXT:    sb a0, 9(a2)
+; RV64I-NEXT:    addi sp, sp, 64
+; RV64I-NEXT:    ret
+;
+; RV32I-LABEL: shl_32bytes_wordOff:
+; RV32I:       # %bb.0:
+; RV32I-NEXT:    addi sp, sp, -64
+; RV32I-NEXT:    lbu a3, 1(a0)
+; RV32I-NEXT:    lbu a4, 0(a0)
+; RV32I-NEXT:    lbu a5, 2(a0)
+; RV32I-NEXT:    lbu a6, 3(a0)
+; RV32I-NEXT:    slli a3, a3, 8
+; RV32I-NEXT:    or a3, a3, a4
+; RV32I-NEXT:    slli a5, a5, 16
+; RV32I-NEXT:    slli a6, a6, 24
+; RV32I-NEXT:    or a4, a6, a5
+; RV32I-NEXT:    or a3, a4, a3
+; RV32I-NEXT:    lbu a4, 5(a0)
+; RV32I-NEXT:    lbu a5, 4(a0)
+; RV32I-NEXT:    lbu a6, 6(a0)
+; RV32I-NEXT:    lbu a7, 7(a0)
+; RV32I-NEXT:    slli a4, a4, 8
+; RV32I-NEXT:    or a4, a4, a5
+; RV32I-NEXT:    slli a6, a6, 16
+; RV32I-NEXT:    slli a7, a7, 24
+; RV32I-NEXT:    or a5, a7, a6
+; RV32I-NEXT:    or a4, a5, a4
+; RV32I-NEXT:    lbu a5, 9(a0)
+; RV32I-NEXT:    lbu a6, 8(a0)
+; RV32I-NEXT:    lbu a7, 10(a0)
+; RV32I-NEXT:    lbu t0, 11(a0)
+; RV32I-NEXT:    slli a5, a5, 8
+; RV32I-NEXT:    or a5, a5, a6
+; RV32I-NEXT:    slli a7, a7, 16
+; RV32I-NEXT:    slli t0, t0, 24
+; RV32I-NEXT:    or a6, t0, a7
+; RV32I-NEXT:    or a5, a6, a5
+; RV32I-NEXT:    lbu a6, 13(a0)
+; RV32I-NEXT:    lbu a7, 12(a0)
+; RV32I-NEXT:    lbu t0, 14(a0)
+; RV32I-NEXT:    lbu t1, 15(a0)
+; RV32I-NEXT:    slli a6, a6, 8
+; RV32I-NEXT:    or a6, a6, a7
+; RV32I-NEXT:    slli t0, t0, 16
+; RV32I-NEXT:    slli t1, t1, 24
+; RV32I-NEXT:    or a7, t1, t0
+; RV32I-NEXT:    or a6, a7, a6
+; RV32I-NEXT:    lbu a7, 17(a0)
+; RV32I-NEXT:    lbu t0, 16(a0)
+; RV32I-NEXT:    lbu t1, 18(a0)
+; RV32I-NEXT:    lbu t2, 19(a0)
+; RV32I-NEXT:    slli a7, a7, 8
+; RV32I-NEXT:    or a7, a7, t0
+; RV32I-NEXT:    slli t1, t1, 16
+; RV32I-NEXT:    slli t2, t2, 24
+; RV32I-NEXT:    or t0, t2, t1
+; RV32I-NEXT:    or a7, t0, a7
+; RV32I-NEXT:    lbu t0, 21(a0)
+; RV32I-NEXT:    lbu t1, 20(a0)
+; RV32I-NEXT:    lbu t2, 22(a0)
+; RV32I-NEXT:    lbu t3, 23(a0)
+; RV32I-NEXT:    slli t0, t0, 8
+; RV32I-NEXT:    or t0, t0, t1
+; RV32I-NEXT:    slli t2, t2, 16
+; RV32I-NEXT:    slli t3, t3, 24
+; RV32I-NEXT:    or t1, t3, t2
+; RV32I-NEXT:    or t0, t1, t0
+; RV32I-NEXT:    lbu t1, 25(a0)
+; RV32I-NEXT:    lbu t2, 24(a0)
+; RV32I-NEXT:    lbu t3, 26(a0)
+; RV32I-NEXT:    lbu t4, 27(a0)
+; RV32I-NEXT:    slli t1, t1, 8
+; RV32I-NEXT:    or t1, t1, t2
+; RV32I-NEXT:    slli t3, t3, 16
+; RV32I-NEXT:    slli t4, t4, 24
+; RV32I-NEXT:    or t2, t4, t3
+; RV32I-NEXT:    or t1, t2, t1
+; RV32I-NEXT:    lbu t2, 29(a0)
+; RV32I-NEXT:    lbu t3, 28(a0)
+; RV32I-NEXT:    lbu t4, 30(a0)
+; RV32I-NEXT:    lbu a0, 31(a0)
+; RV32I-NEXT:    slli t2, t2, 8
+; RV32I-NEXT:    or t2, t2, t3
+; RV32I-NEXT:    slli t4, t4, 16
+; RV32I-NEXT:    slli a0, a0, 24
+; RV32I-NEXT:    or a0, a0, t4
+; RV32I-NEXT:    or a0, a0, t2
+; RV32I-NEXT:    lbu a1, 0(a1)
+; RV32I-NEXT:    sw zero, 28(sp)
+; RV32I-NEXT:    sw zero, 24(sp)
+; RV32I-NEXT:    sw zero, 20(sp)
+; RV32I-NEXT:    sw zero, 16(sp)
+; RV32I-NEXT:    sw zero, 12(sp)
+; RV32I-NEXT:    sw zero, 8(sp)
+; RV32I-NEXT:    sw zero, 4(sp)
+; RV32I-NEXT:    sw zero, 0(sp)
+; RV32I-NEXT:    sw a0, 60(sp)
+; RV32I-NEXT:    sw t1, 56(sp)
+; RV32I-NEXT:    sw t0, 52(sp)
+; RV32I-NEXT:    sw a7, 48(sp)
+; RV32I-NEXT:    sw a6, 44(sp)
+; RV32I-NEXT:    sw a5, 40(sp)
+; RV32I-NEXT:    sw a4, 36(sp)
+; RV32I-NEXT:    sw a3, 32(sp)
+; RV32I-NEXT:    slli a1, a1, 2
+; RV32I-NEXT:    andi a1, a1, 28
+; RV32I-NEXT:    addi a0, sp, 32
+; RV32I-NEXT:    sub a3, a0, a1
+; RV32I-NEXT:    lw a0, 4(a3)
+; RV32I-NEXT:    lw a1, 0(a3)
+; RV32I-NEXT:    lw a4, 12(a3)
+; RV32I-NEXT:    lw a5, 8(a3)
+; RV32I-NEXT:    lw a6, 24(a3)
+; RV32I-NEXT:    lw a7, 28(a3)
+; RV32I-NEXT:    lw t0, 16(a3)
+; RV32I-NEXT:    lw a3, 20(a3)
+; RV32I-NEXT:    sb a6, 24(a2)
+; RV32I-NEXT:    sb a7, 28(a2)
+; RV32I-NEXT:    sb t0, 16(a2)
+; RV32I-NEXT:    sb a3, 20(a2)
+; RV32I-NEXT:    sb a5, 8(a2)
+; RV32I-NEXT:    sb a4, 12(a2)
+; RV32I-NEXT:    sb a1, 0(a2)
+; RV32I-NEXT:    sb a0, 4(a2)
+; RV32I-NEXT:    srli t1, a6, 24
+; RV32I-NEXT:    sb t1, 27(a2)
+; RV32I-NEXT:    srli t1, a6, 16
+; RV32I-NEXT:    sb t1, 26(a2)
+; RV32I-NEXT:    srli a6, a6, 8
+; RV32I-NEXT:    sb a6, 25(a2)
+; RV32I-NEXT:    srli a6, a7, 24
+; RV32I-NEXT:    sb a6, 31(a2)
+; RV32I-NEXT:    srli a6, a7, 16
+; RV32I-NEXT:    sb a6, 30(a2)
+; RV32I-NEXT:    srli a6, a7, 8
+; RV32I-NEXT:    sb a6, 29(a2)
+; RV32I-NEXT:    srli a6, t0, 24
+; RV32I-NEXT:    sb a6, 19(a2)
+; RV32I-NEXT:    srli a6, t0, 16
+; RV32I-NEXT:    sb a6, 18(a2)
+; RV32I-NEXT:    srli a6, t0, 8
+; RV32I-NEXT:    sb a6, 17(a2)
+; RV32I-NEXT:    srli a6, a3, 24
+; RV32I-NEXT:    sb a6, 23(a2)
+; RV32I-NEXT:    srli a6, a3, 16
+; RV32I-NEXT:    sb a6, 22(a2)
+; RV32I-NEXT:    srli a3, a3, 8
+; RV32I-NEXT:    sb a3, 21(a2)
+; RV32I-NEXT:    srli a3, a5, 24
+; RV32I-NEXT:    sb a3, 11(a2)
+; RV32I-NEXT:    srli a3, a5, 16
+; RV32I-NEXT:    sb a3, 10(a2)
+; RV32I-NEXT:    srli a5, a5, 8
+; RV32I-NEXT:    sb a5, 9(a2)
+; RV32I-NEXT:    srli a3, a4, 24
+; RV32I-NEXT:    sb a3, 15(a2)
+; RV32I-NEXT:    srli a3, a4, 16
+; RV32I-NEXT:    sb a3, 14(a2)
+; RV32I-NEXT:    srli a4, a4, 8
+; RV32I-NEXT:    sb a4, 13(a2)
+; RV32I-NEXT:    srli a3, a1, 24
+; RV32I-NEXT:    sb a3, 3(a2)
+; RV32I-NEXT:    srli a3, a1, 16
+; RV32I-NEXT:    sb a3, 2(a2)
+; RV32I-NEXT:    srli a1, a1, 8
+; RV32I-NEXT:    sb a1, 1(a2)
+; RV32I-NEXT:    srli a1, a0, 24
+; RV32I-NEXT:    sb a1, 7(a2)
+; RV32I-NEXT:    srli a1, a0, 16
+; RV32I-NEXT:    sb a1, 6(a2)
+; RV32I-NEXT:    srli a0, a0, 8
+; RV32I-NEXT:    sb a0, 5(a2)
+; RV32I-NEXT:    addi sp, sp, 64
+; RV32I-NEXT:    ret
+  %src = load i256, ptr %src.ptr, align 1
+  %wordOff = load i256, ptr %wordOff.ptr, align 1
+  %bitOff = shl i256 %wordOff, 5
+  %res = shl i256 %src, %bitOff
+  store i256 %res, ptr %dst, align 1
+  ret void
+}
+
+define void @shl_32bytes_dwordOff(ptr %src.ptr, ptr %dwordOff.ptr, ptr %dst) nounwind {
+; RV64I-LABEL: shl_32bytes_dwordOff:
+; RV64I:       # %bb.0:
+; RV64I-NEXT:    addi sp, sp, -64
+; RV64I-NEXT:    lbu a3, 1(a0)
+; RV64I-NEXT:    lbu a4, 0(a0)
+; RV64I-NEXT:    lbu a5, 2(a0)
+; RV64I-NEXT:    lbu a6, 3(a0)
+; RV64I-NEXT:    slli a3, a3, 8
+; RV64I-NEXT:    or a3, a3, a4
+; RV64I-NEXT:    slli a5, a5, 16
+; RV64I-NEXT:    slli a6, a6, 24
+; RV64I-NEXT:    or a4, a6, a5
+; RV64I-NEXT:    or a3, a4, a3
+; RV64I-NEXT:    lbu a4, 5(a0)
+; RV64I-NEXT:    lbu a5, 4(a0)
+; RV64I-NEXT:    lbu a6, 6(a0)
+; RV64I-NEXT:    lbu a7, 7(a0)
+; RV64I-NEXT:    slli a4, a4, 8
+; RV64I-NEXT:    or a4, a4, a5
+; RV64I-NEXT:    slli a6, a6, 16
+; RV64I-NEXT:    slli a7, a7, 24
+; RV64I-NEXT:    or a5, a7, a6
+; RV64I-NEXT:    or a4, a5, a4
+; RV64I-NEXT:    slli a4, a4, 32
+; RV64I-NEXT:    or a3, a4, a3
+; RV64I-NEXT:    lbu a4, 9(a0)
+; RV64I-NEXT:    lbu a5, 8(a0)
+; RV64I-NEXT:    lbu a6, 10(a0)
+; RV64I-NEXT:    lbu a7, 11(a0)
+; RV64I-NEXT:    slli a4, a4, 8
+; RV64I-NEXT:    or a4, a4, a5
+; RV64I-NEXT:    slli a6, a6, 16
+; RV64I-NEXT:    slli a7, a7, 24
+; RV64I-NEXT:    or a5, a7, a6
+; RV64I-NEXT:    or a4, a5, a4
+; RV64I-NEXT:    lbu a5, 13(a0)
+; RV64I-NEXT:    lbu a6, 12(a0)
+; RV64I-NEXT:    lbu a7, 14(a0)
+; RV64I-NEXT:    lbu t0, 15(a0)
+; RV64I-NEXT:    slli a5, a5, 8
+; RV64I-NEXT:    or a5, a5, a6
+; RV64I-NEXT:    slli a7, a7, 16
+; RV64I-NEXT:    slli t0, t0, 24
+; RV64I-NEXT:    or a6, t0, a7
+; RV64I-NEXT:    or a5, a6, a5
+; RV64I-NEXT:    slli a5, a5, 32
+; RV64I-NEXT:    or a4, a5, a4
+; RV64I-NEXT:    lbu a5, 17(a0)
+; RV64I-NEXT:    lbu a6, 16(a0)
+; RV64I-NEXT:    lbu a7, 18(a0)
+; RV64I-NEXT:    lbu t0, 19(a0)
+; RV64I-NEXT:    slli a5, a5, 8
+; RV64I-NEXT:    or a5, a5, a6
+; RV64I-NEXT:    slli a7, a7, 16
+; RV64I-NEXT:    slli t0, t0, 24
+; RV64I-NEXT:    or a6, t0, a7
+; RV64I-NEXT:    or a5, a6, a5
+; RV64I-NEXT:    lbu a6, 21(a0)
+; RV64I-NEXT:    lbu a7, 20(a0)
+; RV64I-NEXT:    lbu t0, 22(a0)
+; RV64I-NEXT:    lbu t1, 23(a0)
+; RV64I-NEXT:    slli a6, a6, 8
+; RV64I-NEXT:    or a6, a6, a7
+; RV64I-NEXT:    slli t0, t0, 16
+; RV64I-NEXT:    slli t1, t1, 24
+; RV64I-NEXT:    or a7, t1, t0
+; RV64I-NEXT:    or a6, a7, a6
+; RV64I-NEXT:    slli a6, a6, 32
+; RV64I-NEXT:    or a5, a6, a5
+; RV64I-NEXT:    lbu a6, 25(a0)
+; RV64I-NEXT:    lbu a7, 24(a0)
+; RV64I-NEXT:    lbu t0, 26(a0)
+; RV64I-NEXT:    lbu t1, 27(a0)
+; RV64I-NEXT:    slli a6, a6, 8
+; RV64I-NEXT:    or a6, a6, a7
+; RV64I-NEXT:    slli t0, t0, 16
+; RV64I-NEXT:    slli t1, t1, 24
+; RV64I-NEXT:    or a7, t1, t0
+; RV64I-NEXT:    or a6, a7, a6
+; RV64I-NEXT:    lbu a7, 29(a0)
+; RV64I-NEXT:    lbu t0, 28(a0)
+; RV64I-NEXT:    lbu t1, 30(a0)
+; RV64I-NEXT:    lbu a0, 31(a0)
+; RV64I-NEXT:    slli a7, a7, 8
+; RV64I-NEXT:    or a7, a7, t0
+; RV64I-NEXT:    slli t1, t1, 16
+; RV64I-NEXT:    slli a0, a0, 24
+; RV64I-NEXT:    or a0, a0, t1
+; RV64I-NEXT:    or a0, a0, a7
+; RV64I-NEXT:    slli a0, a0, 32
+; RV64I-NEXT:    or a0, a0, a6
+; RV64I-NEXT:    lbu a1, 0(a1)
+; RV64I-NEXT:    sd zero, 24(sp)
+; RV64I-NEXT:    sd zero, 16(sp)
+; RV64I-NEXT:    sd zero, 8(sp)
+; RV64I-NEXT:    sd zero, 0(sp)
+; RV64I-NEXT:    sd a0, 56(sp)
+; RV64I-NEXT:    sd a5, 48(sp)
+; RV64I-NEXT:    sd a4, 40(sp)
+; RV64I-NEXT:    sd a3, 32(sp)
+; RV64I-NEXT:    slli a1, a1, 3
+; RV64I-NEXT:    andi a1, a1, 24
+; RV64I-NEXT:    addi a0, sp, 32
+; RV64I-NEXT:    sub a0, a0, a1
+; RV64I-NEXT:    ld a1, 16(a0)
+; RV64I-NEXT:    ld a3, 24(a0)
+; RV64I-NEXT:    ld a4, 0(a0)
+; RV64I-NEXT:    ld a0, 8(a0)
+; RV64I-NEXT:    sb a1, 16(a2)
+; RV64I-NEXT:    sb a3, 24(a2)
+; RV64I-NEXT:    sb a4, 0(a2)
+; RV64I-NEXT:    sb a0, 8(a2)
+; RV64I-NEXT:    srli a5, a1, 56
+; RV64I-NEXT:    sb a5, 23(a2)
+; RV64I-NEXT:    srli a5, a1, 48
+; RV64I-NEXT:    sb a5, 22(a2)
+; RV64I-NEXT:    srli a5, a1, 40
+; RV64I-NEXT:    sb a5, 21(a2)
+; RV64I-NEXT:    srli a5, a1, 32
+; RV64I-NEXT:    sb a5, 20(a2)
+; RV64I-NEXT:    srli a5, a1, 24
+; RV64I-NEXT:    sb a5, 19(a2)
+; RV64I-NEXT:    srli a5, a1, 16
+; RV64I-NEXT:    sb a5, 18(a2)
+; RV64I-NEXT:    srli a1, a1, 8
+; RV64I-NEXT:    sb a1, 17(a2)
+; RV64I-NEXT:    srli a1, a3, 56
+; RV64I-NEXT:    sb a1, 31(a2)
+; RV64I-NEXT:    srli a1, a3, 48
+; RV64I-NEXT:    sb a1, 30(a2)
+; RV64I-NEXT:    srli a1, a3, 40
+; RV64I-NEXT:    sb a1, 29(a2)
+; RV64I-NEXT:    srli a1, a3, 32
+; RV64I-NEXT:    sb a1, 28(a2)
+; RV64I-NEXT:    srli a1, a3, 24
+; RV64I-NEXT:    sb a1, 27(a2)
+; RV64I-NEXT:    srli a1, a3, 16
+; RV64I-NEXT:    sb a1, 26(a2)
+; RV64I-NEXT:    srli a3, a3, 8
+; RV64I-NEXT:    sb a3, 25(a2)
+; RV64I-NEXT:    srli a1, a4, 56
+; RV64I-NEXT:    sb a1, 7(a2)
+; RV64I-NEXT:    srli a1, a4, 48
+; RV64I-NEXT:    sb a1, 6(a2)
+; RV64I-NEXT:    srli a1, a4, 40
+; RV64I-NEXT:    sb a1, 5(a2)
+; RV64I-NEXT:    srli a1, a4, 32
+; RV64I-NEXT:    sb a1, 4(a2)
+; RV64I-NEXT:    srli a1, a4, 24
+; RV64I-NEXT:    sb a1, 3(a2)
+; RV64I-NEXT:    srli a1, a4, 16
+; RV64I-NEXT:    sb a1, 2(a2)
+; RV64I-NEXT:    srli a4, a4, 8
+; RV64I-NEXT:    sb a4, 1(a2)
+; RV64I-NEXT:    srli a1, a0, 56
+; RV64I-NEXT:    sb a1, 15(a2)
+; RV64I-NEXT:    srli a1, a0, 48
+; RV64I-NEXT:    sb a1, 14(a2)
+; RV64I-NEXT:    srli a1, a0, 40
+; RV64I-NEXT:    sb a1, 13(a2)
+; RV64I-NEXT:    srli a1, a0, 32
+; RV64I-NEXT:    sb a1, 12(a2)
+; RV64I-NEXT:    srli a1, a0, 24
+; RV64I-NEXT:    sb a1, 11(a2)
+; RV64I-NEXT:    srli a1, a0, 16
+; RV64I-NEXT:    sb a1, 10(a2)
+; RV64I-NEXT:    srli a0, a0, 8
+; RV64I-NEXT:    sb a0, 9(a2)
+; RV64I-NEXT:    addi sp, sp, 64
+; RV64I-NEXT:    ret
+;
+; RV32I-LABEL: shl_32bytes_dwordOff:
+; RV32I:       # %bb.0:
+; RV32I-NEXT:    addi sp, sp, -64
+; RV32I-NEXT:    lbu a3, 1(a0)
+; RV32I-NEXT:    lbu a4, 0(a0)
+; RV32I-NEXT:    lbu a5, 2(a0)
+; RV32I-NEXT:    lbu a6, 3(a0)
+; RV32I-NEXT:    slli a3, a3, 8
+; RV32I-NEXT:    or a3, a3, a4
+; RV32I-NEXT:    slli a5, a5, 16
+; RV32I-NEXT:    slli a6, a6, 24
+; RV32I-NEXT:    or a4, a6, a5
+; RV32I-NEXT:    or a3, a4, a3
+; RV32I-NEXT:    lbu a4, 5(a0)
+; RV32I-NEXT:    lbu a5, 4(a0)
+; RV32I-NEXT:    lbu a6, 6(a0)
+; RV32I-NEXT:    lbu a7, 7(a0)
+; RV32I-NEXT:    slli a4, a4, 8
+; RV32I-NEXT:    or a4, a4, a5
+; RV32I-NEXT:    slli a6, a6, 16
+; RV32I-NEXT:    slli a7, a7, 24
+; RV32I-NEXT:    or a5, a7, a6
+; RV32I-NEXT:    or a4, a5, a4
+; RV32I-NEXT:    lbu a5, 9(a0)
+; RV32I-NEXT:    lbu a6, 8(a0)
+; RV32I-NEXT:    lbu a7, 10(a0)
+; RV32I-NEXT:    lbu t0, 11(a0)
+; RV32I-NEXT:    slli a5, a5, 8
+; RV32I-NEXT:    or a5, a5, a6
+; RV32I-NEXT:    slli a7, a7, 16
+; RV32I-NEXT:    slli t0, t0, 24
+; RV32I-NEXT:    or a6, t0, a7
+; RV32I-NEXT:    or a5, a6, a5
+; RV32I-NEXT:    lbu a6, 13(a0)
+; RV32I-NEXT:    lbu a7, 12(a0)
+; RV32I-NEXT:    lbu t0, 14(a0)
+; RV32I-NEXT:    lbu t1, 15(a0)
+; RV32I-NEXT:    slli a6, a6, 8
+; RV32I-NEXT:    or a6, a6, a7
+; RV32I-NEXT:    slli t0, t0, 16
+; RV32I-NEXT:    slli t1, t1, 24
+; RV32I-NEXT:    or a7, t1, t0
+; RV32I-NEXT:    or a6, a7, a6
+; RV32I-NEXT:    lbu a7, 17(a0)
+; RV32I-NEXT:    lbu t0, 16(a0)
+; RV32I-NEXT:    lbu t1, 18(a0)
+; RV32I-NEXT:    lbu t2, 19(a0)
+; RV32I-NEXT:    slli a7, a7, 8
+; RV32I-NEXT:    or a7, a7, t0
+; RV32I-NEXT:    slli t1, t1, 16
+; RV32I-NEXT:    slli t2, t2, 24
+; RV32I-NEXT:    or t0, t2, t1
+; RV32I-NEXT:    or a7, t0, a7
+; RV32I-NEXT:    lbu t0, 21(a0)
+; RV32I-NEXT:    lbu t1, 20(a0)
+; RV32I-NEXT:    lbu t2, 22(a0)
+; RV32I-NEXT:    lbu t3, 23(a0)
+; RV32I-NEXT:    slli t0, t0, 8
+; RV32I-NEXT:    or t0, t0, t1
+; RV32I-NEXT:    slli t2, t2, 16
+; RV32I-NEXT:    slli t3, t3, 24
+; RV32I-NEXT:    or t1, t3, t2
+; RV32I-NEXT:    or t0, t1, t0
+; RV32I-NEXT:    lbu t1, 25(a0)
+; RV32I-NEXT:    lbu t2, 24(a0)
+; RV32I-NEXT:    lbu t3, 26(a0)
+; RV32I-NEXT:    lbu t4, 27(a0)
+; RV32I-NEXT:    slli t1, t1, 8
+; RV32I-NEXT:    or t1, t1, t2
+; RV32I-NEXT:    slli t3, t3, 16
+; RV32I-NEXT:    slli t4, t4, 24
+; RV32I-NEXT:    or t2, t4, t3
+; RV32I-NEXT:    or t1, t2, t1
+; RV32I-NEXT:    lbu t2, 29(a0)
+; RV32I-NEXT:    lbu t3, 28(a0)
+; RV32I-NEXT:    lbu t4, 30(a0)
+; RV32I-NEXT:    lbu a0, 31(a0)
+; RV32I-NEXT:    slli t2, t2, 8
+; RV32I-NEXT:    or t2, t2, t3
+; RV32I-NEXT:    slli t4, t4, 16
+; RV32I-NEXT:    slli a0, a0, 24
+; RV32I-NEXT:    or a0, a0, t4
+; RV32I-NEXT:    or a0, a0, t2
+; RV32I-NEXT:    lbu a1, 0(a1)
+; RV32I-NEXT:    sw zero, 28(sp)
+; RV32I-NEXT:    sw zero, 24(sp)
+; RV32I-NEXT:    sw zero, 20(sp)
+; RV32I-NEXT:    sw zero, 16(sp)
+; RV32I-NEXT:    sw zero, 12(sp)
+; RV32I-NEXT:    sw zero, 8(sp)
+; RV32I-NEXT:    sw zero, 4(sp)
+; RV32I-NEXT:    sw zero, 0(sp)
+; RV32I-NEXT:    sw a0, 60(sp)
+; RV32I-NEXT:    sw t1, 56(sp)
+; RV32I-NEXT:    sw t0, 52(sp)
+; RV32I-NEXT:    sw a7, 48(sp)
+; RV32I-NEXT:    sw a6, 44(sp)
+; RV32I-NEXT:    sw a5, 40(sp)
+; RV32I-NEXT:    sw a4, 36(sp)
+; RV32I-NEXT:    sw a3, 32(sp)
+; RV32I-NEXT:    slli a1, a1, 3
+; RV32I-NEXT:    andi a1, a1, 24
+; RV32I-NEXT:    addi a0, sp, 32
+; RV32I-NEXT:    sub a3, a0, a1
+; RV32I-NEXT:    lw a0, 4(a3)
+; RV32I-NEXT:    lw a1, 0(a3)
+; RV32I-NEXT:    lw a4, 12(a3)
+; RV32I-NEXT:    lw a5, 8(a3)
+; RV32I-NEXT:    lw a6, 24(a3)
+; RV32I-NEXT:    lw a7, 28(a3)
+; RV32I-NEXT:    lw t0, 16(a3)
+; RV32I-NEXT:    lw a3, 20(a3)
+; RV32I-NEXT:    sb a6, 24(a2)
+; RV32I-NEXT:    sb a7, 28(a2)
+; RV32I-NEXT:    sb t0, 16(a2)
+; RV32I-NEXT:    sb a3, 20(a2)
+; RV32I-NEXT:    sb a5, 8(a2)
+; RV32I-NEXT:    sb a4, 12(a2)
+; RV32I-NEXT:    sb a1, 0(a2)
+; RV32I-NEXT:    sb a0, 4(a2)
+; RV32I-NEXT:    srli t1, a6, 24
+; RV32I-NEXT:    sb t1, 27(a2)
+; RV32I-NEXT:    srli t1, a6, 16
+; RV32I-NEXT:    sb t1, 26(a2)
+; RV32I-NEXT:    srli a6, a6, 8
+; RV32I-NEXT:    sb a6, 25(a2)
+; RV32I-NEXT:    srli a6, a7, 24
+; RV32I-NEXT:    sb a6, 31(a2)
+; RV32I-NEXT:    srli a6, a7, 16
+; RV32I-NEXT:    sb a6, 30(a2)
+; RV32I-NEXT:    srli a6, a7, 8
+; RV32I-NEXT:    sb a6, 29(a2)
+; RV32I-NEXT:    srli a6, t0, 24
+; RV32I-NEXT:    sb a6, 19(a2)
+; RV32I-NEXT:    srli a6, t0, 16
+; RV32I-NEXT:    sb a6, 18(a2)
+; RV32I-NEXT:    srli a6, t0, 8
+; RV32I-NEXT:    sb a6, 17(a2)
+; RV32I-NEXT:    srli a6, a3, 24
+; RV32I-NEXT:    sb a6, 23(a2)
+; RV32I-NEXT:    srli a6, a3, 16
+; RV32I-NEXT:    sb a6, 22(a2)
+; RV32I-NEXT:    srli a3, a3, 8
+; RV32I-NEXT:    sb a3, 21(a2)
+; RV32I-NEXT:    srli a3, a5, 24
+; RV32I-NEXT:    sb a3, 11(a2)
+; RV32I-NEXT:    srli a3, a5, 16
+; RV32I-NEXT:    sb a3, 10(a2)
+; RV32I-NEXT:    srli a5, a5, 8
+; RV32I-NEXT:    sb a5, 9(a2)
+; RV32I-NEXT:    srli a3, a4, 24
+; RV32I-NEXT:    sb a3, 15(a2)
+; RV32I-NEXT:    srli a3, a4, 16
+; RV32I-NEXT:    sb a3, 14(a2)
+; RV32I-NEXT:    srli a4, a4, 8
+; RV32I-NEXT:    sb a4, 13(a2)
+; RV32I-NEXT:    srli a3, a1, 24
+; RV32I-NEXT:    sb a3, 3(a2)
+; RV32I-NEXT:    srli a3, a1, 16
+; RV32I-NEXT:    sb a3, 2(a2)
+; RV32I-NEXT:    srli a1, a1, 8
+; RV32I-NEXT:    sb a1, 1(a2)
+; RV32I-NEXT:    srli a1, a0, 24
+; RV32I-NEXT:    sb a1, 7(a2)
+; RV32I-NEXT:    srli a1, a0, 16
+; RV32I-NEXT:    sb a1, 6(a2)
+; RV32I-NEXT:    srli a0, a0, 8
+; RV32I-NEXT:    sb a0, 5(a2)
+; RV32I-NEXT:    addi sp, sp, 64
+; RV32I-NEXT:    ret
+  %src = load i256, ptr %src.ptr, align 1
+  %dwordOff = load i256, ptr %dwordOff.ptr, align 1
+  %bitOff = shl i256 %dwordOff, 6
+  %res = shl i256 %src, %bitOff
+  store i256 %res, ptr %dst, align 1
+  ret void
+}
+
+define void @ashr_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
+; RV64I-LABEL: ashr_32bytes:
+; RV64I:       # %bb.0:
+; RV64I-NEXT:    addi sp, sp, -64
+; RV64I-NEXT:    lbu a3, 1(a0)
+; RV64I-NEXT:    lbu a4, 0(a0)
+; RV64I-NEXT:    lbu a5, 2(a0)
+; RV64I-NEXT:    lbu a6, 3(a0)
+; RV64I-NEXT:    slli a3, a3, 8
+; RV64I-NEXT:    or a3, a3, a4
+; RV64I-NEXT:    slli a5, a5, 16
+; RV64I-NEXT:    slli a6, a6, 24
+; RV64I-NEXT:    or a4, a6, a5
+; RV64I-NEXT:    or a3, a4, a3
+; RV64I-NEXT:    lbu a4, 5(a0)
+; RV64I-NEXT:    lbu a5, 4(a0)
+; RV64I-NEXT:    lbu a6, 6(a0)
+; RV64I-NEXT:    lbu a7, 7(a0)
+; RV64I-NEXT:    slli a4, a4, 8
+; RV64I-NEXT:    or a4, a4, a5
+; RV64I-NEXT:    slli a6, a6, 16
+; RV64I-NEXT:    slli a7, a7, 24
+; RV64I-NEXT:    or a5, a7, a6
+; RV64I-NEXT:    or a4, a5, a4
+; RV64I-NEXT:    slli a4, a4, 32
+; RV64I-NEXT:    or a3, a4, a3
+; RV64I-NEXT:    lbu a4, 9(a0)
+; RV64I-NEXT:    lbu a5, 8(a0)
+; RV64I-NEXT:    lbu a6, 10(a0)
+; RV64I-NEXT:    lbu a7, 11(a0)
+; RV64I-NEXT:    slli a4, a4, 8
+; RV64I-NEXT:    or a4, a4, a5
+; RV64I-NEXT:    slli a6, a6, 16
+; RV64I-NEXT:    slli a7, a7, 24
+; RV64I-NEXT:    or a5, a7, a6
+; RV64I-NEXT:    or a4, a5, a4
+; RV64I-NEXT:    lbu a5, 13(a0)
+; RV64I-NEXT:    lbu a6, 12(a0)
+; RV64I-NEXT:    lbu a7, 14(a0)
+; RV64I-NEXT:    lbu t0, 15(a0)
+; RV64I-NEXT:    slli a5, a5, 8
+; RV64I-NEXT:    or a5, a5, a6
+; RV64I-NEXT:    slli a7, a7, 16
+; RV64I-NEXT:    slli t0, t0, 24
+; RV64I-NEXT:    or a6, t0, a7
+; RV64I-NEXT:    or a5, a6, a5
+; RV64I-NEXT:    slli a5, a5, 32
+; RV64I-NEXT:    or a4, a5, a4
+; RV64I-NEXT:    lbu a5, 17(a0)
+; RV64I-NEXT:    lbu a6, 16(a0)
+; RV64I-NEXT:    lbu a7, 18(a0)
+; RV64I-NEXT:    lbu t0, 19(a0)
+; RV64I-NEXT:    slli a5, a5, 8
+; RV64I-NEXT:    or a5, a5, a6
+; RV64I-NEXT:    slli a7, a7, 16
+; RV64I-NEXT:    slli t0, t0, 24
+; RV64I-NEXT:    or a6, t0, a7
+; RV64I-NEXT:    or a5, a6, a5
+; RV64I-NEXT:    lbu a6, 21(a0)
+; RV64I-NEXT:    lbu a7, 20(a0)
+; RV64I-NEXT:    lbu t0, 22(a0)
+; RV64I-NEXT:    lbu t1, 23(a0)
+; RV64I-NEXT:    slli a6, a6, 8
+; RV64I-NEXT:    or a6, a6, a7
+; RV64I-NEXT:    slli t0, t0, 16
+; RV64I-NEXT:    slli t1, t1, 24
+; RV64I-NEXT:    or a7, t1, t0
+; RV64I-NEXT:    or a6, a7, a6
+; RV64I-NEXT:    slli a6, a6, 32
+; RV64I-NEXT:    or a5, a6, a5
+; RV64I-NEXT:    lbu a6, 25(a0)
+; RV64I-NEXT:    lbu a7, 24(a0)
+; RV64I-NEXT:    lbu t0, 26(a0)
+; RV64I-NEXT:    lbu t1, 27(a0)
+; RV64I-NEXT:    slli a6, a6, 8
+; RV64I-NEXT:    or a6, a6, a7
+; RV64I-NEXT:    slli t0, t0, 16
+; RV64I-NEXT:    slli t1, t1, 24
+; RV64I-NEXT:    or a7, t1, t0
+; RV64I-NEXT:    or a6, a7, a6
+; RV64I-NEXT:    lbu a7, 29(a0)
+; RV64I-NEXT:    lbu t0, 28(a0)
+; RV64I-NEXT:    lbu t1, 30(a0)
+; RV64I-NEXT:    lbu a0, 31(a0)
+; RV64I-NEXT:    slli a7, a7, 8
+; RV64I-NEXT:    or a7, a7, t0
+; RV64I-NEXT:    slli t1, t1, 16
+; RV64I-NEXT:    slli a0, a0, 24
+; RV64I-NEXT:    or a0, a0, t1
+; RV64I-NEXT:    or a0, a0, a7
+; RV64I-NEXT:    slli a7, a0, 32
+; RV64I-NEXT:    or a6, a7, a6
+; RV64I-NEXT:    lbu a7, 1(a1)
+; RV64I-NEXT:    lbu t0, 0(a1)
+; RV64I-NEXT:    lbu t1, 2(a1)
+; RV64I-NEXT:    lbu t2, 3(a1)
+; RV64I-NEXT:    slli a7, a7, 8
+; RV64I-NEXT:    or a7, a7, t0
+; RV64I-NEXT:    slli t1, t1, 16
+; RV64I-NEXT:    slli t2, t2, 24
+; RV64I-NEXT:    or t0, t2, t1
+; RV64I-NEXT:    or a7, t0, a7
+; RV64I-NEXT:    lbu t0, 5(a1)
+; RV64I-NEXT:    lbu t1, 4(a1)
+; RV64I-NEXT:    lbu t2, 6(a1)
+; RV64I-NEXT:    lbu a1, 7(a1)
+; RV64I-NEXT:    slli t0, t0, 8
+; RV64I-NEXT:    or t0, t0, t1
+; RV64I-NEXT:    slli t2, t2, 16
+; RV64I-NEXT:    slli a1, a1, 24
+; RV64I-NEXT:    or a1, a1, t2
+; RV64I-NEXT:    or a1, a1, t0
+; RV64I-NEXT:    slli a1, a1, 32
+; RV64I-NEXT:    or a1, a1, a7
+; RV64I-NEXT:    sraiw a0, a0, 31
+; RV64I-NEXT:    sd a0, 56(sp)
+; RV64I-NEXT:    sd a0, 48(sp)
+; RV64I-NEXT:    sd a0, 40(sp)
+; RV64I-NEXT:    sd a0, 32(sp)
+; RV64I-NEXT:    sd a6, 24(sp)
+; RV64I-NEXT:    sd a5, 16(sp)
+; RV64I-NEXT:    sd a4, 8(sp)
+; RV64I-NEXT:    sd a3, 0(sp)
+; RV64I-NEXT:    andi a0, a1, 24
+; RV64I-NEXT:    mv a3, sp
+; RV64I-NEXT:    add a3, a3, a0
+; RV64I-NEXT:    ld a4, 8(a3)
+; RV64I-NEXT:    slli a1, a1, 3
+; RV64I-NEXT:    srl a5, a4, a1
+; RV64I-NEXT:    ld a6, 16(a3)
+; RV64I-NEXT:    andi a0, a1, 56
+; RV64I-NEXT:    xori a7, a0, 63
+; RV64I-NEXT:    ld t0, 0(a3)
+; RV64I-NEXT:    slli a0, a6, 1
+; RV64I-NEXT:    sll a0, a0, a7
+; RV64I-NEXT:    or a0, a5, a0
+; RV64I-NEXT:    srl t0, t0, a1
+; RV64I-NEXT:    slli a4, a4, 1
+; RV64I-NEXT:    ld a3, 24(a3)
+; RV64I-NEXT:    sll a4, a4, a7
+; RV64I-NEXT:    or a4, t0, a4
+; RV64I-NEXT:    srl a6, a6, a1
+; RV64I-NEXT:    slli t1, a3, 1
+; RV64I-NEXT:    sll a7, t1, a7
+; RV64I-NEXT:    or a7, a6, a7
+; RV64I-NEXT:    sra a1, a3, a1
+; RV64I-NEXT:    sb a6, 16(a2)
+; RV64I-NEXT:    sb a1, 24(a2)
+; RV64I-NEXT:    sb t0, 0(a2)
+; RV64I-NEXT:    sb a5, 8(a2)
+; RV64I-NEXT:    srli a3, a1, 56
+; RV64I-NEXT:    sb a3, 31(a2)
+; RV64I-NEXT:    srli a3, a1, 48
+; RV64I-NEXT:    sb a3, 30(a2)
+; RV64I-NEXT:    srli a3, a1, 40
+; RV64I-NEXT:    sb a3, 29(a2)
+; RV64I-NEXT:    srli a3, a1, 32
+; RV64I-NEXT:    sb a3, 28(a2)
+; RV64I-NEXT:    srli a3, a1, 24
+; RV64I-NEXT:    sb a3, 27(a2)
+; RV64I-NEXT:    srli a3, a1, 16
+; RV64I-NEXT:    sb a3, 26(a2)
+; RV64I-NEXT:    srli a1, a1, 8
+; RV64I-NEXT:    sb a1, 25(a2)
+; RV64I-NEXT:    srli a1, a7, 56
+; RV64I-NEXT:    sb a1, 23(a2)
+; RV64I-NEXT:    srli a1, a7, 48
+; RV64I-NEXT:    sb a1, 22(a2)
+; RV64I-NEXT:    srli a1, a7, 40
+; RV64I-NEXT:    sb a1, 21(a2)
+; RV64I-NEXT:    srli a1, a7, 32
+; RV64I-NEXT:    sb a1, 20(a2)
+; RV64I-NEXT:    srli a1, a7, 24
+; RV64I-NEXT:    sb a1, 19(a2)
+; RV64I-NEXT:    srli a1, a7, 16
+; RV64I-NEXT:    sb a1, 18(a2)
+; RV64I-NEXT:    srli a1, a7, 8
+; RV64I-NEXT:    sb a1, 17(a2)
+; RV64I-NEXT:    srli a1, a4, 56
+; RV64I-NEXT:    sb a1, 7(a2)
+; RV64I-NEXT:    srli a1, a4, 48
+; RV64I-NEXT:    sb a1, 6(a2)
+; RV64I-NEXT:    srli a1, a4, 40
+; RV64I-NEXT:    sb a1, 5(a2)
+; RV64I-NEXT:    srli a1, a4, 32
+; RV64I-NEXT:    sb a1, 4(a2)
+; RV64I-NEXT:    srli a1, a4, 24
+; RV64I-NEXT:    sb a1, 3(a2)
+; RV64I-NEXT:    srli a1, a4, 16
+; RV64I-NEXT:    sb a1, 2(a2)
+; RV64I-NEXT:    srli a4, a4, 8
+; RV64I-NEXT:    sb a4, 1(a2)
+; RV64I-NEXT:    srli a1, a0, 56
+; RV64I-NEXT:    sb a1, 15(a2)
+; RV64I-NEXT:    srli a1, a0, 48
+; RV64I-NEXT:    sb a1, 14(a2)
+; RV64I-NEXT:    srli a1, a0, 40
+; RV64I-NEXT:    sb a1, 13(a2)
+; RV64I-NEXT:    srli a1, a0, 32
+; RV64I-NEXT:    sb a1, 12(a2)
+; RV64I-NEXT:    srli a1, a0, 24
+; RV64I-NEXT:    sb a1, 11(a2)
+; RV64I-NEXT:    srli a1, a0, 16
+; RV64I-NEXT:    sb a1, 10(a2)
+; RV64I-NEXT:    srli a0, a0, 8
+; RV64I-NEXT:    sb a0, 9(a2)
+; RV64I-NEXT:    addi sp, sp, 64
+; RV64I-NEXT:    ret
+;
+; RV32I-LABEL: ashr_32bytes:
+; RV32I:       # %bb.0:
+; RV32I-NEXT:    addi sp, sp, -80
+; RV32I-NEXT:    sw s0, 76(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s1, 72(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s2, 68(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    lbu a3, 1(a0)
+; RV32I-NEXT:    lbu a4, 0(a0)
+; RV32I-NEXT:    lbu a5, 2(a0)
+; RV32I-NEXT:    lbu a6, 3(a0)
+; RV32I-NEXT:    slli a3, a3, 8
+; RV32I-NEXT:    or a3, a3, a4
+; RV32I-NEXT:    slli a5, a5, 16
+; RV32I-NEXT:    slli a6, a6, 24
+; RV32I-NEXT:    or a4, a6, a5
+; RV32I-NEXT:    or a3, a4, a3
+; RV32I-NEXT:    lbu a4, 5(a0)
+; RV32I-NEXT:    lbu a5, 4(a0)
+; RV32I-NEXT:    lbu a6, 6(a0)
+; RV32I-NEXT:    lbu a7, 7(a0)
+; RV32I-NEXT:    slli a4, a4, 8
+; RV32I-NEXT:    or a4, a4, a5
+; RV32I-NEXT:    slli a6, a6, 16
+; RV32I-NEXT:    slli a7, a7, 24
+; RV32I-NEXT:    or a5, a7, a6
+; RV32I-NEXT:    or a4, a5, a4
+; RV32I-NEXT:    lbu a5, 9(a0)
+; RV32I-NEXT:    lbu a6, 8(a0)
+; RV32I-NEXT:    lbu a7, 10(a0)
+; RV32I-NEXT:    lbu t0, 11(a0)
+; RV32I-NEXT:    slli a5, a5, 8
+; RV32I-NEXT:    or a5, a5, a6
+; RV32I-NEXT:    slli a7, a7, 16
+; RV32I-NEXT:    slli t0, t0, 24
+; RV32I-NEXT:    or a6, t0, a7
+; RV32I-NEXT:    or a5, a6, a5
+; RV32I-NEXT:    lbu a6, 13(a0)
+; RV32I-NEXT:    lbu a7, 12(a0)
+; RV32I-NEXT:    lbu t0, 14(a0)
+; RV32I-NEXT:    lbu t1, 15(a0)
+; RV32I-NEXT:    slli a6, a6, 8
+; RV32I-NEXT:    or a6, a6, a7
+; RV32I-NEXT:    slli t0, t0, 16
+; RV32I-NEXT:    slli t1, t1, 24
+; RV32I-NEXT:    or a7, t1, t0
+; RV32I-NEXT:    or a6, a7, a6
+; RV32I-NEXT:    lbu a7, 17(a0)
+; RV32I-NEXT:    lbu t0, 16(a0)
+; RV32I-NEXT:    lbu t1, 18(a0)
+; RV32I-NEXT:    lbu t2, 19(a0)
+; RV32I-NEXT:    slli a7, a7, 8
+; RV32I-NEXT:    or a7, a7, t0
+; RV32I-NEXT:    slli t1, t1, 16
+; RV32I-NEXT:    slli t2, t2, 24
+; RV32I-NEXT:    or t0, t2, t1
+; RV32I-NEXT:    or a7, t0, a7
+; RV32I-NEXT:    lbu t0, 21(a0)
+; RV32I-NEXT:    lbu t1, 20(a0)
+; RV32I-NEXT:    lbu t2, 22(a0)
+; RV32I-NEXT:    lbu t3, 23(a0)
+; RV32I-NEXT:    slli t0, t0, 8
+; RV32I-NEXT:    or t0, t0, t1
+; RV32I-NEXT:    slli t2, t2, 16
+; RV32I-NEXT:    slli t3, t3, 24
+; RV32I-NEXT:    or t1, t3, t2
+; RV32I-NEXT:    or t0, t1, t0
+; RV32I-NEXT:    lbu t1, 25(a0)
+; RV32I-NEXT:    lbu t2, 24(a0)
+; RV32I-NEXT:    lbu t3, 26(a0)
+; RV32I-NEXT:    lbu t4, 27(a0)
+; RV32I-NEXT:    slli t1, t1, 8
+; RV32I-NEXT:    or t1, t1, t2
+; RV32I-NEXT:    slli t3, t3, 16
+; RV32I-NEXT:    slli t4, t4, 24
+; RV32I-NEXT:    or t2, t4, t3
+; RV32I-NEXT:    or t1, t2, t1
+; RV32I-NEXT:    lbu t2, 29(a0)
+; RV32I-NEXT:    lbu t3, 28(a0)
+; RV32I-NEXT:    lbu t4, 30(a0)
+; RV32I-NEXT:    lbu a0, 31(a0)
+; RV32I-NEXT:    slli t2, t2, 8
+; RV32I-NEXT:    or t2, t2, t3
+; RV32I-NEXT:    slli t4, t4, 16
+; RV32I-NEXT:    slli a0, a0, 24
+; RV32I-NEXT:    or t3, a0, t4
+; RV32I-NEXT:    or t2, t3, t2
+; RV32I-NEXT:    lbu t3, 1(a1)
+; RV32I-NEXT:    lbu t4, 0(a1)
+; RV32I-NEXT:    lbu t5, 2(a1)
+; RV32I-NEXT:    lbu a1, 3(a1)
+; RV32I-NEXT:    slli t3, t3, 8
+; RV32I-NEXT:    or t3, t3, t4
+; RV32I-NEXT:    slli t5, t5, 16
+; RV32I-NEXT:    slli a1, a1, 24
+; RV32I-NEXT:    or a1, a1, t5
+; RV32I-NEXT:    or a1, a1, t3
+; RV32I-NEXT:    srai a0, a0, 31
+; RV32I-NEXT:    sw a0, 60(sp)
+; RV32I-NEXT:    sw a0, 56(sp)
+; RV32I-NEXT:    sw a0, 52(sp)
+; RV32I-NEXT:    sw a0, 48(sp)
+; RV32I-NEXT:    sw a0, 44(sp)
+; RV32I-NEXT:    sw a0, 40(sp)
+; RV32I-NEXT:    sw a0, 36(sp)
+; RV32I-NEXT:    sw a0, 32(sp)
+; RV32I-NEXT:    sw t2, 28(sp)
+; RV32I-NEXT:    sw t1, 24(sp)
+; RV32I-NEXT:    sw t0, 20(sp)
+; RV32I-NEXT:    sw a7, 16(sp)
+; RV32I-NEXT:    sw a6, 12(sp)
+; RV32I-NEXT:    sw a5, 8(sp)
+; RV32I-NEXT:    sw a4, 4(sp)
+; RV32I-NEXT:    sw a3, 0(sp)
+; RV32I-NEXT:    andi a0, a1, 28
+; RV32I-NEXT:    mv a3, sp
+; RV32I-NEXT:    add a5, a3, a0
+; RV32I-NEXT:    lw a3, 4(a5)
+; RV32I-NEXT:    slli a6, a1, 3
+; RV32I-NEXT:    srl a4, a3, a6
+; RV32I-NEXT:    lw a7, 8(a5)
+; RV32I-NEXT:    andi a0, a6, 24
+; RV32I-NEXT:    xori t0, a0, 31
+; RV32I-NEXT:    lw a1, 0(a5)
+; RV32I-NEXT:    slli a0, a7, 1
+; RV32I-NEXT:    sll a0, a0, t0
+; RV32I-NEXT:    or a0, a4, a0
+; RV32I-NEXT:    srl t1, a1, a6
+; RV32I-NEXT:    slli a3, a3, 1
+; RV32I-NEXT:    lw t2, 12(a5)
+; RV32I-NEXT:    lw t3, 16(a5)
+; RV32I-NEXT:    sll a1, a3, t0
+; RV32I-NEXT:    or a1, t1, a1
+; RV32I-NEXT:    srl t4, t2, a6
+; RV32I-NEXT:    slli a3, t3, 1
+; RV32I-NEXT:    sll a3, a3, t0
+; RV32I-NEXT:    or a3, t4, a3
+; RV32I-NEXT:    srl a7, a7, a6
+; RV32I-NEXT:    slli t2, t2, 1
+; RV32I-NEXT:    lw t5, 20(a5)
+; RV32I-NEXT:    lw t6, 24(a5)
+; RV32I-NEXT:    sll t2, t2, t0
+; RV32I-NEXT:    or t2, a7, t2
+; RV32I-NEXT:    srl s0, t5, a6
+; RV32I-NEXT:    slli s1, t6, 1
+; RV32I-NEXT:    sll s1, s1, t0
+; RV32I-NEXT:    or s1, s0, s1
+; RV32I-NEXT:    srl t3, t3, a6
+; RV32I-NEXT:    slli t5, t5, 1
+; RV32I-NEXT:    lw a5, 28(a5)
+; RV32I-NEXT:    sll t5, t5, t0
+; RV32I-NEXT:    or t5, t3, t5
+; RV32I-NEXT:    srl t6, t6, a6
+; RV32I-NEXT:    slli s2, a5, 1
+; RV32I-NEXT:    sll t0, s2, t0
+; RV32I-NEXT:    or t0, t6, t0
+; RV32I-NEXT:    sra a5, a5, a6
+; RV32I-NEXT:    sb t6, 24(a2)
+; RV32I-NEXT:    sb a5, 28(a2)
+; RV32I-NEXT:    sb t3, 16(a2)
+; RV32I-NEXT:    sb s0, 20(a2)
+; RV32I-NEXT:    sb a7, 8(a2)
+; RV32I-NEXT:    sb t4, 12(a2)
+; RV32I-NEXT:    sb t1, 0(a2)
+; RV32I-NEXT:    sb a4, 4(a2)
+; RV32I-NEXT:    srli a4, a5, 24
+; RV32I-NEXT:    sb a4, 31(a2)
+; RV32I-NEXT:    srli a4, a5, 16
+; RV32I-NEXT:    sb a4, 30(a2)
+; RV32I-NEXT:    srli a5, a5, 8
+; RV32I-NEXT:    sb a5, 29(a2)
+; RV32I-NEXT:    srli a4, t0, 24
+; RV32I-NEXT:    sb a4, 27(a2)
+; RV32I-NEXT:    srli a4, t0, 16
+; RV32I-NEXT:    sb a4, 26(a2)
+; RV32I-NEXT:    srli a4, t0, 8
+; RV32I-NEXT:    sb a4, 25(a2)
+; RV32I-NEXT:    srli a4, t5, 24
+; RV32I-NEXT:    sb a4, 19(a2)
+; RV32I-NEXT:    srli a4, t5, 16
+; RV32I-NEXT:    sb a4, 18(a2)
+; RV32I-NEXT:    srli a4, t5, 8
+; RV32I-NEXT:    sb a4, 17(a2)
+; RV32I-NEXT:    srli a4, s1, 24
+; RV32I-NEXT:    sb a4, 23(a2)
+; RV32I-NEXT:    srli a4, s1, 16
+; RV32I-NEXT:    sb a4, 22(a2)
+; RV32I-NEXT:    srli s1, s1, 8
+; RV32I-NEXT:    sb s1, 21(a2)
+; RV32I-NEXT:    srli a4, t2, 24
+; RV32I-NEXT:    sb a4, 11(a2)
+; RV32I-NEXT:    srli a4, t2, 16
+; RV32I-NEXT:    sb a4, 10(a2)
+; RV32I-NEXT:    srli a4, t2, 8
+; RV32I-NEXT:    sb a4, 9(a2)
+; RV32I-NEXT:    srli a4, a3, 24
+; RV32I-NEXT:    sb a4, 15(a2)
+; RV32I-NEXT:    srli a4, a3, 16
+; RV32I-NEXT:    sb a4, 14(a2)
+; RV32I-NEXT:    srli a3, a3, 8
+; RV32I-NEXT:    sb a3, 13(a2)
+; RV32I-NEXT:    srli a3, a1, 24
+; RV32I-NEXT:    sb a3, 3(a2)
+; RV32I-NEXT:    srli a3, a1, 16
+; RV32I-NEXT:    sb a3, 2(a2)
+; RV32I-NEXT:    srli a1, a1, 8
+; RV32I-NEXT:    sb a1, 1(a2)
+; RV32I-NEXT:    srli a1, a0, 24
+; RV32I-NEXT:    sb a1, 7(a2)
+; RV32I-NEXT:    srli a1, a0, 16
+; RV32I-NEXT:    sb a1, 6(a2)
+; RV32I-NEXT:    srli a0, a0, 8
+; RV32I-NEXT:    sb a0, 5(a2)
+; RV32I-NEXT:    lw s0, 76(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s1, 72(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s2, 68(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    addi sp, sp, 80
+; RV32I-NEXT:    ret
+  %src = load i256, ptr %src.ptr, align 1
+  %byteOff = load i256, ptr %byteOff.ptr, align 1
+  %bitOff = shl i256 %byteOff, 3
+  %res = ashr i256 %src, %bitOff
+  store i256 %res, ptr %dst, align 1
+  ret void
+}
+
+define void @ashr_32bytes_wordOff(ptr %src.ptr, ptr %wordOff.ptr, ptr %dst) nounwind {
+; RV64I-LABEL: ashr_32bytes_wordOff:
+; RV64I:       # %bb.0:
+; RV64I-NEXT:    addi sp, sp, -64
+; RV64I-NEXT:    lbu a3, 1(a0)
+; RV64I-NEXT:    lbu a4, 0(a0)
+; RV64I-NEXT:    lbu a5, 2(a0)
+; RV64I-NEXT:    lbu a6, 3(a0)
+; RV64I-NEXT:    slli a3, a3, 8
+; RV64I-NEXT:    or a3, a3, a4
+; RV64I-NEXT:    slli a5, a5, 16
+; RV64I-NEXT:    slli a6, a6, 24
+; RV64I-NEXT:    or a4, a6, a5
+; RV64I-NEXT:    or a3, a4, a3
+; RV64I-NEXT:    lbu a4, 5(a0)
+; RV64I-NEXT:    lbu a5, 4(a0)
+; RV64I-NEXT:    lbu a6, 6(a0)
+; RV64I-NEXT:    lbu a7, 7(a0)
+; RV64I-NEXT:    slli a4, a4, 8
+; RV64I-NEXT:    or a4, a4, a5
+; RV64I-NEXT:    slli a6, a6, 16
+; RV64I-NEXT:    slli a7, a7, 24
+; RV64I-NEXT:    or a5, a7, a6
+; RV64I-NEXT:    or a4, a5, a4
+; RV64I-NEXT:    slli a4, a4, 32
+; RV64I-NEXT:    or a3, a4, a3
+; RV64I-NEXT:    lbu a4, 9(a0)
+; RV64I-NEXT:    lbu a5, 8(a0)
+; RV64I-NEXT:    lbu a6, 10(a0)
+; RV64I-NEXT:    lbu a7, 11(a0)
+; RV64I-NEXT:    slli a4, a4, 8
+; RV64I-NEXT:    or a4, a4, a5
+; RV64I-NEXT:    slli a6, a6, 16
+; RV64I-NEXT:    slli a7, a7, 24
+; RV64I-NEXT:    or a5, a7, a6
+; RV64I-NEXT:    or a4, a5, a4
+; RV64I-NEXT:    lbu a5, 13(a0)
+; RV64I-NEXT:    lbu a6, 12(a0)
+; RV64I-NEXT:    lbu a7, 14(a0)
+; RV64I-NEXT:    lbu t0, 15(a0)
+; RV64I-NEXT:    slli a5, a5, 8
+; RV64I-NEXT:    or a5, a5, a6
+; RV64I-NEXT:    slli a7, a7, 16
+; RV64I-NEXT:    slli t0, t0, 24
+; RV64I-NEXT:    or a6, t0, a7
+; RV64I-NEXT:    or a5, a6, a5
+; RV64I-NEXT:    slli a5, a5, 32
+; RV64I-NEXT:    or a4, a5, a4
+; RV64I-NEXT:    lbu a5, 17(a0)
+; RV64I-NEXT:    lbu a6, 16(a0)
+; RV64I-NEXT:    lbu a7, 18(a0)
+; RV64I-NEXT:    lbu t0, 19(a0)
+; RV64I-NEXT:    slli a5, a5, 8
+; RV64I-NEXT:    or a5, a5, a6
+; RV64I-NEXT:    slli a7, a7, 16
+; RV64I-NEXT:    slli t0, t0, 24
+; RV64I-NEXT:    or a6, t0, a7
+; RV64I-NEXT:    or a5, a6, a5
+; RV64I-NEXT:    lbu a6, 21(a0)
+; RV64I-NEXT:    lbu a7, 20(a0)
+; RV64I-NEXT:    lbu t0, 22(a0)
+; RV64I-NEXT:    lbu t1, 23(a0)
+; RV64I-NEXT:    slli a6, a6, 8
+; RV64I-NEXT:    or a6, a6, a7
+; RV64I-NEXT:    slli t0, t0, 16
+; RV64I-NEXT:    slli t1, t1, 24
+; RV64I-NEXT:    or a7, t1, t0
+; RV64I-NEXT:    or a6, a7, a6
+; RV64I-NEXT:    slli a6, a6, 32
+; RV64I-NEXT:    or a5, a6, a5
+; RV64I-NEXT:    lbu a6, 25(a0)
+; RV64I-NEXT:    lbu a7, 24(a0)
+; RV64I-NEXT:    lbu t0, 26(a0)
+; RV64I-NEXT:    lbu t1, 27(a0)
+; RV64I-NEXT:    slli a6, a6, 8
+; RV64I-NEXT:    or a6, a6, a7
+; RV64I-NEXT:    slli t0, t0, 16
+; RV64I-NEXT:    slli t1, t1, 24
+; RV64I-NEXT:    or a7, t1, t0
+; RV64I-NEXT:    or a6, a7, a6
+; RV64I-NEXT:    lbu a7, 29(a0)
+; RV64I-NEXT:    lbu t0, 28(a0)
+; RV64I-NEXT:    lbu t1, 30(a0)
+; RV64I-NEXT:    lbu a0, 31(a0)
+; RV64I-NEXT:    slli a7, a7, 8
+; RV64I-NEXT:    or a7, a7, t0
+; RV64I-NEXT:    slli t1, t1, 16
+; RV64I-NEXT:    slli a0, a0, 24
+; RV64I-NEXT:    or a0, a0, t1
+; RV64I-NEXT:    or a0, a0, a7
+; RV64I-NEXT:    slli a7, a0, 32
+; RV64I-NEXT:    or a6, a7, a6
+; RV64I-NEXT:    lbu a7, 1(a1)
+; RV64I-NEXT:    lbu t0, 0(a1)
+; RV64I-NEXT:    lbu t1, 2(a1)
+; RV64I-NEXT:    lbu t2, 3(a1)
+; RV64I-NEXT:    slli a7, a7, 8
+; RV64I-NEXT:    or a7, a7, t0
+; RV64I-NEXT:    slli t1, t1, 16
+; RV64I-NEXT:    slli t2, t2, 24
+; RV64I-NEXT:    or t0, t2, t1
+; RV64I-NEXT:    or a7, t0, a7
+; RV64I-NEXT:    lbu t0, 5(a1)
+; RV64I-NEXT:    lbu t1, 4(a1)
+; RV64I-NEXT:    lbu t2, 6(a1)
+; RV64I-NEXT:    lbu a1, 7(a1)
+; RV64I-NEXT:    slli t0, t0, 8
+; RV64I-NEXT:    or t0, t0, t1
+; RV64I-NEXT:    slli t2, t2, 16
+; RV64I-NEXT:    slli a1, a1, 24
+; RV64I-NEXT:    or a1, a1, t2
+; RV64I-NEXT:    or a1, a1, t0
+; RV64I-NEXT:    slli a1, a1, 32
+; RV64I-NEXT:    or a1, a1, a7
+; RV64I-NEXT:    sraiw a0, a0, 31
+; RV64I-NEXT:    sd a0, 56(sp)
+; RV64I-NEXT:    sd a0, 48(sp)
+; RV64I-NEXT:    sd a0, 40(sp)
+; RV64I-NEXT:    sd a0, 32(sp)
+; RV64I-NEXT:    sd a6, 24(sp)
+; RV64I-NEXT:    sd a5, 16(sp)
+; RV64I-NEXT:    sd a4, 8(sp)
+; RV64I-NEXT:    sd a3, 0(sp)
+; RV64I-NEXT:    slli a0, a1, 2
+; RV64I-NEXT:    andi a0, a0, 24
+; RV64I-NEXT:    mv a3, sp
+; RV64I-NEXT:    add a3, a3, a0
+; RV64I-NEXT:    ld a4, 8(a3)
+; RV64I-NEXT:    slli a5, a1, 5
+; RV64I-NEXT:    srl a1, a4, a5
+; RV64I-NEXT:    ld a6, 16(a3)
+; RV64I-NEXT:    andi a0, a5, 32
+; RV64I-NEXT:    xori a7, a0, 63
+; RV64I-NEXT:    ld t0, 0(a3)
+; RV64I-NEXT:    slli a0, a6, 1
+; RV64I-NEXT:    sll a0, a0, a7
+; RV64I-NEXT:    or a0, a1, a0
+; RV64I-NEXT:    srl t0, t0, a5
+; RV64I-NEXT:    slli a4, a4, 1
+; RV64I-NEXT:    ld a3, 24(a3)
+; RV64I-NEXT:    sll a4, a4, a7
+; RV64I-NEXT:    or a4, t0, a4
+; RV64I-NEXT:    srl a6, a6, a5
+; RV64I-NEXT:    slli t1, a3, 1
+; RV64I-NEXT:    sll a7, t1, a7
+; RV64I-NEXT:    or a7, a6, a7
+; RV64I-NEXT:    sra a3, a3, a5
+; RV64I-NEXT:    sb a6, 16(a2)
+; RV64I-NEXT:    sb a3, 24(a2)
+; RV64I-NEXT:    sb t0, 0(a2)
+; RV64I-NEXT:    sb a1, 8(a2)
+; RV64I-NEXT:    srli a5, a6, 24
+; RV64I-NEXT:    sb a5, 19(a2)
+; RV64I-NEXT:    srli a5, a6, 16
+; RV64I-NEXT:    sb a5, 18(a2)
+; RV64I-NEXT:    srli a5, a6, 8
+; RV64I-NEXT:    sb a5, 17(a2)
+; RV64I-NEXT:    srli a5, a3, 56
+; RV64I-NEXT:    sb a5, 31(a2)
+; RV64I-NEXT:    srli a5, a3, 48
+; RV64I-NEXT:    sb a5, 30(a2)
+; RV64I-NEXT:    srli a5, a3, 40
+; RV64I-NEXT:    sb a5, 29(a2)
+; RV64I-NEXT:    srli a5, a3, 32
+; RV64I-NEXT:    sb a5, 28(a2)
+; RV64I-NEXT:    srli a5, a3, 24
+; RV64I-NEXT:    sb a5, 27(a2)
+; RV64I-NEXT:    srli a5, a3, 16
+; RV64I-NEXT:    sb a5, 26(a2)
+; RV64I-NEXT:    srli a3, a3, 8
+; RV64I-NEXT:    sb a3, 25(a2)
+; RV64I-NEXT:    srli a3, t0, 24
+; RV64I-NEXT:    sb a3, 3(a2)
+; RV64I-NEXT:    srli a3, t0, 16
+; RV64I-NEXT:    sb a3, 2(a2)
+; RV64I-NEXT:    srli a3, t0, 8
+; RV64I-NEXT:    sb a3, 1(a2)
+; RV64I-NEXT:    srli a3, a1, 24
+; RV64I-NEXT:    sb a3, 11(a2)
+; RV64I-NEXT:    srli a3, a1, 16
+; RV64I-NEXT:    sb a3, 10(a2)
+; RV64I-NEXT:    srli a1, a1, 8
+; RV64I-NEXT:    sb a1, 9(a2)
+; RV64I-NEXT:    srli a1, a7, 56
+; RV64I-NEXT:    sb a1, 23(a2)
+; RV64I-NEXT:    srli a1, a7, 48
+; RV64I-NEXT:    sb a1, 22(a2)
+; RV64I-NEXT:    srli a1, a7, 40
+; RV64I-NEXT:    sb a1, 21(a2)
+; RV64I-NEXT:    srli a1, a7, 32
+; RV64I-NEXT:    sb a1, 20(a2)
+; RV64I-NEXT:    srli a1, a4, 56
+; RV64I-NEXT:    sb a1, 7(a2)
+; RV64I-NEXT:    srli a1, a4, 48
+; RV64I-NEXT:    sb a1, 6(a2)
+; RV64I-NEXT:    srli a1, a4, 40
+; RV64I-NEXT:    sb a1, 5(a2)
+; RV64I-NEXT:    srli a4, a4, 32
+; RV64I-NEXT:    sb a4, 4(a2)
+; RV64I-NEXT:    srli a1, a0, 56
+; RV64I-NEXT:    sb a1, 15(a2)
+; RV64I-NEXT:    srli a1, a0, 48
+; RV64I-NEXT:    sb a1, 14(a2)
+; RV64I-NEXT:    srli a1, a0, 40
+; RV64I-NEXT:    sb a1, 13(a2)
+; RV64I-NEXT:    srli a0, a0, 32
 ; RV64I-NEXT:    sb a0, 12(a2)
-; RV64I-NEXT:    ld a0, 24(sp) # 8-byte Folded Reload
-; RV64I-NEXT:    sb a0, 11(a2)
-; RV64I-NEXT:    ld a0, 32(sp) # 8-byte Folded Reload
-; RV64I-NEXT:    sb a0, 10(a2)
-; RV64I-NEXT:    ld a0, 40(sp) # 8-byte Folded Reload
-; RV64I-NEXT:    sb a0, 9(a2)
-; RV64I-NEXT:    ld a0, 48(sp) # 8-byte Folded Reload
-; RV64I-NEXT:    sb a0, 8(a2)
-; RV64I-NEXT:    ld ra, 216(sp) # 8-byte Folded Reload
-; RV64I-NEXT:    ld s0, 208(sp) # 8-byte Folded Reload
-; RV64I-NEXT:    ld s1, 200(sp) # 8-byte Folded Reload
-; RV64I-NEXT:    ld s2, 192(sp) # 8-byte Folded Reload
-; RV64I-NEXT:    ld s3, 184(sp) # 8-byte Folded Reload
-; RV64I-NEXT:    ld s4, 176(sp) # 8-byte Folded Reload
-; RV64I-NEXT:    ld s5, 168(sp) # 8-byte Folded Reload
-; RV64I-NEXT:    ld s6, 160(sp) # 8-byte Folded Reload
-; RV64I-NEXT:    ld s7, 152(sp) # 8-byte Folded Reload
-; RV64I-NEXT:    ld s8, 144(sp) # 8-byte Folded Reload
-; RV64I-NEXT:    ld s9, 136(sp) # 8-byte Folded Reload
-; RV64I-NEXT:    ld s10, 128(sp) # 8-byte Folded Reload
-; RV64I-NEXT:    ld s11, 120(sp) # 8-byte Folded Reload
-; RV64I-NEXT:    addi sp, sp, 224
+; RV64I-NEXT:    addi sp, sp, 64
 ; RV64I-NEXT:    ret
 ;
-; RV32I-LABEL: shl_32bytes:
+; RV32I-LABEL: ashr_32bytes_wordOff:
 ; RV32I:       # %bb.0:
-; RV32I-NEXT:    addi sp, sp, -144
-; RV32I-NEXT:    sw ra, 140(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s0, 136(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s1, 132(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s2, 128(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s3, 124(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s4, 120(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s5, 116(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s6, 112(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s7, 108(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s8, 104(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s9, 100(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s10, 96(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s11, 92(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    lbu a3, 0(a0)
-; RV32I-NEXT:    sw a3, 24(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    addi sp, sp, -64
 ; RV32I-NEXT:    lbu a3, 1(a0)
-; RV32I-NEXT:    sw a3, 20(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    lbu a3, 2(a0)
-; RV32I-NEXT:    sw a3, 16(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    lbu a3, 3(a0)
-; RV32I-NEXT:    sw a3, 12(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    lbu a3, 4(a0)
-; RV32I-NEXT:    sw a3, 8(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    lbu a3, 5(a0)
-; RV32I-NEXT:    sw a3, 4(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    lbu t1, 6(a0)
-; RV32I-NEXT:    lbu t2, 7(a0)
-; RV32I-NEXT:    lbu t3, 8(a0)
-; RV32I-NEXT:    lbu t4, 9(a0)
-; RV32I-NEXT:    lbu t5, 10(a0)
-; RV32I-NEXT:    lbu t6, 11(a0)
-; RV32I-NEXT:    lbu s0, 12(a0)
-; RV32I-NEXT:    lbu s1, 13(a0)
-; RV32I-NEXT:    lbu s2, 14(a0)
-; RV32I-NEXT:    lbu s3, 15(a0)
-; RV32I-NEXT:    lbu s4, 16(a0)
-; RV32I-NEXT:    lbu s5, 17(a0)
-; RV32I-NEXT:    lbu s6, 18(a0)
-; RV32I-NEXT:    lbu s7, 19(a0)
-; RV32I-NEXT:    lbu s8, 20(a0)
-; RV32I-NEXT:    lbu s9, 21(a0)
-; RV32I-NEXT:    lbu s10, 22(a0)
-; RV32I-NEXT:    lbu s11, 23(a0)
-; RV32I-NEXT:    lbu ra, 24(a0)
-; RV32I-NEXT:    lbu t0, 25(a0)
-; RV32I-NEXT:    lbu a7, 26(a0)
-; RV32I-NEXT:    lbu a6, 27(a0)
-; RV32I-NEXT:    lbu a5, 28(a0)
-; RV32I-NEXT:    lbu a3, 31(a0)
-; RV32I-NEXT:    lbu a4, 30(a0)
-; RV32I-NEXT:    lbu a0, 29(a0)
+; RV32I-NEXT:    lbu a4, 0(a0)
+; RV32I-NEXT:    lbu a5, 2(a0)
+; RV32I-NEXT:    lbu a6, 3(a0)
+; RV32I-NEXT:    slli a3, a3, 8
+; RV32I-NEXT:    or a3, a3, a4
+; RV32I-NEXT:    slli a5, a5, 16
+; RV32I-NEXT:    slli a6, a6, 24
+; RV32I-NEXT:    or a4, a6, a5
+; RV32I-NEXT:    or a3, a4, a3
+; RV32I-NEXT:    lbu a4, 5(a0)
+; RV32I-NEXT:    lbu a5, 4(a0)
+; RV32I-NEXT:    lbu a6, 6(a0)
+; RV32I-NEXT:    lbu a7, 7(a0)
+; RV32I-NEXT:    slli a4, a4, 8
+; RV32I-NEXT:    or a4, a4, a5
+; RV32I-NEXT:    slli a6, a6, 16
+; RV32I-NEXT:    slli a7, a7, 24
+; RV32I-NEXT:    or a5, a7, a6
+; RV32I-NEXT:    or a4, a5, a4
+; RV32I-NEXT:    lbu a5, 9(a0)
+; RV32I-NEXT:    lbu a6, 8(a0)
+; RV32I-NEXT:    lbu a7, 10(a0)
+; RV32I-NEXT:    lbu t0, 11(a0)
+; RV32I-NEXT:    slli a5, a5, 8
+; RV32I-NEXT:    or a5, a5, a6
+; RV32I-NEXT:    slli a7, a7, 16
+; RV32I-NEXT:    slli t0, t0, 24
+; RV32I-NEXT:    or a6, t0, a7
+; RV32I-NEXT:    or a5, a6, a5
+; RV32I-NEXT:    lbu a6, 13(a0)
+; RV32I-NEXT:    lbu a7, 12(a0)
+; RV32I-NEXT:    lbu t0, 14(a0)
+; RV32I-NEXT:    lbu t1, 15(a0)
+; RV32I-NEXT:    slli a6, a6, 8
+; RV32I-NEXT:    or a6, a6, a7
+; RV32I-NEXT:    slli t0, t0, 16
+; RV32I-NEXT:    slli t1, t1, 24
+; RV32I-NEXT:    or a7, t1, t0
+; RV32I-NEXT:    or a6, a7, a6
+; RV32I-NEXT:    lbu a7, 17(a0)
+; RV32I-NEXT:    lbu t0, 16(a0)
+; RV32I-NEXT:    lbu t1, 18(a0)
+; RV32I-NEXT:    lbu t2, 19(a0)
+; RV32I-NEXT:    slli a7, a7, 8
+; RV32I-NEXT:    or a7, a7, t0
+; RV32I-NEXT:    slli t1, t1, 16
+; RV32I-NEXT:    slli t2, t2, 24
+; RV32I-NEXT:    or t0, t2, t1
+; RV32I-NEXT:    or a7, t0, a7
+; RV32I-NEXT:    lbu t0, 21(a0)
+; RV32I-NEXT:    lbu t1, 20(a0)
+; RV32I-NEXT:    lbu t2, 22(a0)
+; RV32I-NEXT:    lbu t3, 23(a0)
+; RV32I-NEXT:    slli t0, t0, 8
+; RV32I-NEXT:    or t0, t0, t1
+; RV32I-NEXT:    slli t2, t2, 16
+; RV32I-NEXT:    slli t3, t3, 24
+; RV32I-NEXT:    or t1, t3, t2
+; RV32I-NEXT:    or t0, t1, t0
+; RV32I-NEXT:    lbu t1, 25(a0)
+; RV32I-NEXT:    lbu t2, 24(a0)
+; RV32I-NEXT:    lbu t3, 26(a0)
+; RV32I-NEXT:    lbu t4, 27(a0)
+; RV32I-NEXT:    slli t1, t1, 8
+; RV32I-NEXT:    or t1, t1, t2
+; RV32I-NEXT:    slli t3, t3, 16
+; RV32I-NEXT:    slli t4, t4, 24
+; RV32I-NEXT:    or t2, t4, t3
+; RV32I-NEXT:    or t1, t2, t1
+; RV32I-NEXT:    lbu t2, 29(a0)
+; RV32I-NEXT:    lbu t3, 28(a0)
+; RV32I-NEXT:    lbu t4, 30(a0)
+; RV32I-NEXT:    lbu a0, 31(a0)
+; RV32I-NEXT:    slli t2, t2, 8
+; RV32I-NEXT:    or t2, t2, t3
+; RV32I-NEXT:    slli t4, t4, 16
+; RV32I-NEXT:    slli a0, a0, 24
+; RV32I-NEXT:    or t3, a0, t4
+; RV32I-NEXT:    or t2, t3, t2
 ; RV32I-NEXT:    lbu a1, 0(a1)
-; RV32I-NEXT:    sb a3, 91(sp)
-; RV32I-NEXT:    sb a4, 90(sp)
-; RV32I-NEXT:    sb a0, 89(sp)
-; RV32I-NEXT:    sb a5, 88(sp)
-; RV32I-NEXT:    sb a6, 87(sp)
-; RV32I-NEXT:    sb a7, 86(sp)
-; RV32I-NEXT:    sb zero, 59(sp)
-; RV32I-NEXT:    sb zero, 58(sp)
-; RV32I-NEXT:    sb zero, 57(sp)
-; RV32I-NEXT:    sb zero, 56(sp)
-; RV32I-NEXT:    sb zero, 55(sp)
-; RV32I-NEXT:    sb zero, 54(sp)
-; RV32I-NEXT:    sb zero, 53(sp)
-; RV32I-NEXT:    sb zero, 52(sp)
-; RV32I-NEXT:    sb zero, 51(sp)
-; RV32I-NEXT:    sb zero, 50(sp)
-; RV32I-NEXT:    sb zero, 49(sp)
-; RV32I-NEXT:    sb zero, 48(sp)
-; RV32I-NEXT:    sb zero, 47(sp)
-; RV32I-NEXT:    sb zero, 46(sp)
-; RV32I-NEXT:    sb zero, 45(sp)
-; RV32I-NEXT:    sb zero, 44(sp)
-; RV32I-NEXT:    sb zero, 43(sp)
-; RV32I-NEXT:    sb zero, 42(sp)
-; RV32I-NEXT:    sb zero, 41(sp)
-; RV32I-NEXT:    sb zero, 40(sp)
-; RV32I-NEXT:    sb zero, 39(sp)
-; RV32I-NEXT:    sb zero, 38(sp)
-; RV32I-NEXT:    sb zero, 37(sp)
-; RV32I-NEXT:    sb zero, 36(sp)
-; RV32I-NEXT:    sb zero, 35(sp)
-; RV32I-NEXT:    sb zero, 34(sp)
-; RV32I-NEXT:    sb zero, 33(sp)
-; RV32I-NEXT:    sb zero, 32(sp)
-; RV32I-NEXT:    sb zero, 31(sp)
-; RV32I-NEXT:    sb zero, 30(sp)
-; RV32I-NEXT:    sb zero, 29(sp)
-; RV32I-NEXT:    sb zero, 28(sp)
-; RV32I-NEXT:    sb t0, 85(sp)
-; RV32I-NEXT:    sb ra, 84(sp)
-; RV32I-NEXT:    sb s11, 83(sp)
-; RV32I-NEXT:    sb s10, 82(sp)
-; RV32I-NEXT:    sb s9, 81(sp)
-; RV32I-NEXT:    sb s8, 80(sp)
-; RV32I-NEXT:    sb s7, 79(sp)
-; RV32I-NEXT:    sb s6, 78(sp)
-; RV32I-NEXT:    sb s5, 77(sp)
-; RV32I-NEXT:    sb s4, 76(sp)
-; RV32I-NEXT:    sb s3, 75(sp)
-; RV32I-NEXT:    sb s2, 74(sp)
-; RV32I-NEXT:    sb s1, 73(sp)
-; RV32I-NEXT:    sb s0, 72(sp)
-; RV32I-NEXT:    sb t6, 71(sp)
-; RV32I-NEXT:    sb t5, 70(sp)
-; RV32I-NEXT:    sb t4, 69(sp)
-; RV32I-NEXT:    sb t3, 68(sp)
-; RV32I-NEXT:    sb t2, 67(sp)
-; RV32I-NEXT:    sb t1, 66(sp)
-; RV32I-NEXT:    lw a0, 4(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    sb a0, 65(sp)
-; RV32I-NEXT:    lw a0, 8(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    sb a0, 64(sp)
-; RV32I-NEXT:    lw a0, 12(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    sb a0, 63(sp)
-; RV32I-NEXT:    lw a0, 16(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    sb a0, 62(sp)
-; RV32I-NEXT:    lw a0, 20(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    sb a0, 61(sp)
-; RV32I-NEXT:    lw a0, 24(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    sb a0, 60(sp)
-; RV32I-NEXT:    andi a1, a1, 31
-; RV32I-NEXT:    addi a0, sp, 60
-; RV32I-NEXT:    sub a6, a0, a1
-; RV32I-NEXT:    lbu a0, 6(a6)
-; RV32I-NEXT:    sw a0, 24(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    lbu a0, 7(a6)
-; RV32I-NEXT:    sw a0, 20(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    lbu a0, 4(a6)
-; RV32I-NEXT:    sw a0, 16(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    lbu a0, 5(a6)
-; RV32I-NEXT:    sw a0, 12(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    lbu a0, 0(a6)
-; RV32I-NEXT:    sw a0, 8(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    lbu a7, 1(a6)
-; RV32I-NEXT:    lbu t0, 2(a6)
-; RV32I-NEXT:    lbu t1, 3(a6)
-; RV32I-NEXT:    lbu t2, 14(a6)
-; RV32I-NEXT:    lbu t3, 15(a6)
-; RV32I-NEXT:    lbu t4, 12(a6)
-; RV32I-NEXT:    lbu t5, 13(a6)
-; RV32I-NEXT:    lbu t6, 10(a6)
-; RV32I-NEXT:    lbu s0, 11(a6)
-; RV32I-NEXT:    lbu s1, 8(a6)
-; RV32I-NEXT:    lbu s2, 9(a6)
-; RV32I-NEXT:    lbu s3, 22(a6)
-; RV32I-NEXT:    lbu s4, 23(a6)
-; RV32I-NEXT:    lbu s5, 20(a6)
-; RV32I-NEXT:    lbu s6, 21(a6)
-; RV32I-NEXT:    lbu s7, 18(a6)
-; RV32I-NEXT:    lbu s8, 19(a6)
-; RV32I-NEXT:    lbu s9, 16(a6)
-; RV32I-NEXT:    lbu s10, 17(a6)
-; RV32I-NEXT:    lbu s11, 30(a6)
-; RV32I-NEXT:    lbu ra, 31(a6)
-; RV32I-NEXT:    lbu a5, 28(a6)
-; RV32I-NEXT:    lbu a4, 29(a6)
-; RV32I-NEXT:    lbu a0, 25(a6)
-; RV32I-NEXT:    lbu a1, 24(a6)
-; RV32I-NEXT:    lbu a3, 27(a6)
-; RV32I-NEXT:    lbu a6, 26(a6)
-; RV32I-NEXT:    sb a0, 25(a2)
-; RV32I-NEXT:    sb a1, 24(a2)
-; RV32I-NEXT:    sb a3, 27(a2)
-; RV32I-NEXT:    sb a6, 26(a2)
-; RV32I-NEXT:    sb a4, 29(a2)
-; RV32I-NEXT:    sb a5, 28(a2)
-; RV32I-NEXT:    sb ra, 31(a2)
-; RV32I-NEXT:    sb s11, 30(a2)
-; RV32I-NEXT:    sb s10, 17(a2)
-; RV32I-NEXT:    sb s9, 16(a2)
-; RV32I-NEXT:    sb s8, 19(a2)
-; RV32I-NEXT:    sb s7, 18(a2)
-; RV32I-NEXT:    sb s6, 21(a2)
-; RV32I-NEXT:    sb s5, 20(a2)
-; RV32I-NEXT:    sb s4, 23(a2)
-; RV32I-NEXT:    sb s3, 22(a2)
-; RV32I-NEXT:    sb s2, 9(a2)
-; RV32I-NEXT:    sb s1, 8(a2)
-; RV32I-NEXT:    sb s0, 11(a2)
-; RV32I-NEXT:    sb t6, 10(a2)
-; RV32I-NEXT:    sb t5, 13(a2)
-; RV32I-NEXT:    sb t4, 12(a2)
-; RV32I-NEXT:    sb t3, 15(a2)
-; RV32I-NEXT:    sb t2, 14(a2)
-; RV32I-NEXT:    sb t1, 3(a2)
-; RV32I-NEXT:    sb t0, 2(a2)
-; RV32I-NEXT:    sb a7, 1(a2)
-; RV32I-NEXT:    lw a0, 8(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    sb a0, 0(a2)
-; RV32I-NEXT:    lw a0, 12(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    sb a0, 5(a2)
-; RV32I-NEXT:    lw a0, 16(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    srai a0, a0, 31
+; RV32I-NEXT:    sw a0, 60(sp)
+; RV32I-NEXT:    sw a0, 56(sp)
+; RV32I-NEXT:    sw a0, 52(sp)
+; RV32I-NEXT:    sw a0, 48(sp)
+; RV32I-NEXT:    sw a0, 44(sp)
+; RV32I-NEXT:    sw a0, 40(sp)
+; RV32I-NEXT:    sw a0, 36(sp)
+; RV32I-NEXT:    sw a0, 32(sp)
+; RV32I-NEXT:    sw t2, 28(sp)
+; RV32I-NEXT:    sw t1, 24(sp)
+; RV32I-NEXT:    sw t0, 20(sp)
+; RV32I-NEXT:    sw a7, 16(sp)
+; RV32I-NEXT:    sw a6, 12(sp)
+; RV32I-NEXT:    sw a5, 8(sp)
+; RV32I-NEXT:    sw a4, 4(sp)
+; RV32I-NEXT:    sw a3, 0(sp)
+; RV32I-NEXT:    slli a1, a1, 2
+; RV32I-NEXT:    andi a1, a1, 28
+; RV32I-NEXT:    mv a0, sp
+; RV32I-NEXT:    add a3, a0, a1
+; RV32I-NEXT:    lw a0, 4(a3)
+; RV32I-NEXT:    lw a1, 0(a3)
+; RV32I-NEXT:    lw a4, 12(a3)
+; RV32I-NEXT:    lw a5, 8(a3)
+; RV32I-NEXT:    lw a6, 24(a3)
+; RV32I-NEXT:    lw a7, 28(a3)
+; RV32I-NEXT:    lw t0, 16(a3)
+; RV32I-NEXT:    lw a3, 20(a3)
+; RV32I-NEXT:    sb a6, 24(a2)
+; RV32I-NEXT:    sb a7, 28(a2)
+; RV32I-NEXT:    sb t0, 16(a2)
+; RV32I-NEXT:    sb a3, 20(a2)
+; RV32I-NEXT:    sb a5, 8(a2)
+; RV32I-NEXT:    sb a4, 12(a2)
+; RV32I-NEXT:    sb a1, 0(a2)
 ; RV32I-NEXT:    sb a0, 4(a2)
-; RV32I-NEXT:    lw a0, 20(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    sb a0, 7(a2)
-; RV32I-NEXT:    lw a0, 24(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    sb a0, 6(a2)
-; RV32I-NEXT:    lw ra, 140(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s0, 136(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s1, 132(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s2, 128(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s3, 124(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s4, 120(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s5, 116(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s6, 112(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s7, 108(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s8, 104(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s9, 100(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s10, 96(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s11, 92(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    addi sp, sp, 144
+; RV32I-NEXT:    srli t1, a6, 24
+; RV32I-NEXT:    sb t1, 27(a2)
+; RV32I-NEXT:    srli t1, a6, 16
+; RV32I-NEXT:    sb t1, 26(a2)
+; RV32I-NEXT:    srli a6, a6, 8
+; RV32I-NEXT:    sb a6, 25(a2)
+; RV32I-NEXT:    srli a6, a7, 24
+; RV32I-NEXT:    sb a6, 31(a2)
+; RV32I-NEXT:    srli a6, a7, 16
+; RV32I-NEXT:    sb a6, 30(a2)
+; RV32I-NEXT:    srli a6, a7, 8
+; RV32I-NEXT:    sb a6, 29(a2)
+; RV32I-NEXT:    srli a6, t0, 24
+; RV32I-NEXT:    sb a6, 19(a2)
+; RV32I-NEXT:    srli a6, t0, 16
+; RV32I-NEXT:    sb a6, 18(a2)
+; RV32I-NEXT:    srli a6, t0, 8
+; RV32I-NEXT:    sb a6, 17(a2)
+; RV32I-NEXT:    srli a6, a3, 24
+; RV32I-NEXT:    sb a6, 23(a2)
+; RV32I-NEXT:    srli a6, a3, 16
+; RV32I-NEXT:    sb a6, 22(a2)
+; RV32I-NEXT:    srli a3, a3, 8
+; RV32I-NEXT:    sb a3, 21(a2)
+; RV32I-NEXT:    srli a3, a5, 24
+; RV32I-NEXT:    sb a3, 11(a2)
+; RV32I-NEXT:    srli a3, a5, 16
+; RV32I-NEXT:    sb a3, 10(a2)
+; RV32I-NEXT:    srli a5, a5, 8
+; RV32I-NEXT:    sb a5, 9(a2)
+; RV32I-NEXT:    srli a3, a4, 24
+; RV32I-NEXT:    sb a3, 15(a2)
+; RV32I-NEXT:    srli a3, a4, 16
+; RV32I-NEXT:    sb a3, 14(a2)
+; RV32I-NEXT:    srli a4, a4, 8
+; RV32I-NEXT:    sb a4, 13(a2)
+; RV32I-NEXT:    srli a3, a1, 24
+; RV32I-NEXT:    sb a3, 3(a2)
+; RV32I-NEXT:    srli a3, a1, 16
+; RV32I-NEXT:    sb a3, 2(a2)
+; RV32I-NEXT:    srli a1, a1, 8
+; RV32I-NEXT:    sb a1, 1(a2)
+; RV32I-NEXT:    srli a1, a0, 24
+; RV32I-NEXT:    sb a1, 7(a2)
+; RV32I-NEXT:    srli a1, a0, 16
+; RV32I-NEXT:    sb a1, 6(a2)
+; RV32I-NEXT:    srli a0, a0, 8
+; RV32I-NEXT:    sb a0, 5(a2)
+; RV32I-NEXT:    addi sp, sp, 64
 ; RV32I-NEXT:    ret
   %src = load i256, ptr %src.ptr, align 1
-  %byteOff = load i256, ptr %byteOff.ptr, align 1
-  %bitOff = shl i256 %byteOff, 3
-  %res = shl i256 %src, %bitOff
+  %wordOff = load i256, ptr %wordOff.ptr, align 1
+  %bitOff = shl i256 %wordOff, 5
+  %res = ashr i256 %src, %bitOff
   store i256 %res, ptr %dst, align 1
   ret void
 }
-define void @ashr_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
-; RV64I-LABEL: ashr_32bytes:
+
+define void @ashr_32bytes_dwordOff(ptr %src.ptr, ptr %dwordOff.ptr, ptr %dst) nounwind {
+; RV64I-LABEL: ashr_32bytes_dwordOff:
 ; RV64I:       # %bb.0:
-; RV64I-NEXT:    addi sp, sp, -224
-; RV64I-NEXT:    sd ra, 216(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    sd s0, 208(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    sd s1, 200(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    sd s2, 192(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    sd s3, 184(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    sd s4, 176(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    sd s5, 168(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    sd s6, 160(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    sd s7, 152(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    sd s8, 144(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    sd s9, 136(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    sd s10, 128(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    sd s11, 120(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    mv t0, a1
-; RV64I-NEXT:    lbu t1, 31(a0)
-; RV64I-NEXT:    lbu a1, 0(a0)
-; RV64I-NEXT:    sd a1, 48(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    lbu a1, 1(a0)
-; RV64I-NEXT:    sd a1, 40(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    lbu a1, 2(a0)
-; RV64I-NEXT:    sd a1, 32(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    lbu a1, 3(a0)
-; RV64I-NEXT:    sd a1, 24(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    lbu a1, 4(a0)
-; RV64I-NEXT:    sd a1, 16(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    lbu a1, 5(a0)
-; RV64I-NEXT:    sd a1, 8(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    lbu t2, 6(a0)
-; RV64I-NEXT:    lbu t3, 7(a0)
-; RV64I-NEXT:    lbu t4, 8(a0)
-; RV64I-NEXT:    lbu t5, 9(a0)
-; RV64I-NEXT:    lbu t6, 10(a0)
-; RV64I-NEXT:    lbu s0, 11(a0)
-; RV64I-NEXT:    lbu s1, 12(a0)
-; RV64I-NEXT:    lbu s2, 13(a0)
-; RV64I-NEXT:    lbu s3, 14(a0)
-; RV64I-NEXT:    lbu s4, 15(a0)
-; RV64I-NEXT:    lbu s5, 16(a0)
-; RV64I-NEXT:    lbu s6, 17(a0)
-; RV64I-NEXT:    lbu s7, 18(a0)
-; RV64I-NEXT:    lbu s8, 19(a0)
-; RV64I-NEXT:    lbu s9, 20(a0)
-; RV64I-NEXT:    lbu s10, 21(a0)
-; RV64I-NEXT:    lbu s11, 22(a0)
-; RV64I-NEXT:    lbu ra, 23(a0)
-; RV64I-NEXT:    lbu a7, 24(a0)
+; RV64I-NEXT:    addi sp, sp, -64
+; RV64I-NEXT:    lbu a3, 1(a0)
+; RV64I-NEXT:    lbu a4, 0(a0)
+; RV64I-NEXT:    lbu a5, 2(a0)
+; RV64I-NEXT:    lbu a6, 3(a0)
+; RV64I-NEXT:    slli a3, a3, 8
+; RV64I-NEXT:    or a3, a3, a4
+; RV64I-NEXT:    slli a5, a5, 16
+; RV64I-NEXT:    slli a6, a6, 24
+; RV64I-NEXT:    or a4, a6, a5
+; RV64I-NEXT:    or a3, a4, a3
+; RV64I-NEXT:    lbu a4, 5(a0)
+; RV64I-NEXT:    lbu a5, 4(a0)
+; RV64I-NEXT:    lbu a6, 6(a0)
+; RV64I-NEXT:    lbu a7, 7(a0)
+; RV64I-NEXT:    slli a4, a4, 8
+; RV64I-NEXT:    or a4, a4, a5
+; RV64I-NEXT:    slli a6, a6, 16
+; RV64I-NEXT:    slli a7, a7, 24
+; RV64I-NEXT:    or a5, a7, a6
+; RV64I-NEXT:    or a4, a5, a4
+; RV64I-NEXT:    slli a4, a4, 32
+; RV64I-NEXT:    or a3, a4, a3
+; RV64I-NEXT:    lbu a4, 9(a0)
+; RV64I-NEXT:    lbu a5, 8(a0)
+; RV64I-NEXT:    lbu a6, 10(a0)
+; RV64I-NEXT:    lbu a7, 11(a0)
+; RV64I-NEXT:    slli a4, a4, 8
+; RV64I-NEXT:    or a4, a4, a5
+; RV64I-NEXT:    slli a6, a6, 16
+; RV64I-NEXT:    slli a7, a7, 24
+; RV64I-NEXT:    or a5, a7, a6
+; RV64I-NEXT:    or a4, a5, a4
+; RV64I-NEXT:    lbu a5, 13(a0)
+; RV64I-NEXT:    lbu a6, 12(a0)
+; RV64I-NEXT:    lbu a7, 14(a0)
+; RV64I-NEXT:    lbu t0, 15(a0)
+; RV64I-NEXT:    slli a5, a5, 8
+; RV64I-NEXT:    or a5, a5, a6
+; RV64I-NEXT:    slli a7, a7, 16
+; RV64I-NEXT:    slli t0, t0, 24
+; RV64I-NEXT:    or a6, t0, a7
+; RV64I-NEXT:    or a5, a6, a5
+; RV64I-NEXT:    slli a5, a5, 32
+; RV64I-NEXT:    or a4, a5, a4
+; RV64I-NEXT:    lbu a5, 17(a0)
+; RV64I-NEXT:    lbu a6, 16(a0)
+; RV64I-NEXT:    lbu a7, 18(a0)
+; RV64I-NEXT:    lbu t0, 19(a0)
+; RV64I-NEXT:    slli a5, a5, 8
+; RV64I-NEXT:    or a5, a5, a6
+; RV64I-NEXT:    slli a7, a7, 16
+; RV64I-NEXT:    slli t0, t0, 24
+; RV64I-NEXT:    or a6, t0, a7
+; RV64I-NEXT:    or a5, a6, a5
+; RV64I-NEXT:    lbu a6, 21(a0)
+; RV64I-NEXT:    lbu a7, 20(a0)
+; RV64I-NEXT:    lbu t0, 22(a0)
+; RV64I-NEXT:    lbu t1, 23(a0)
+; RV64I-NEXT:    slli a6, a6, 8
+; RV64I-NEXT:    or a6, a6, a7
+; RV64I-NEXT:    slli t0, t0, 16
+; RV64I-NEXT:    slli t1, t1, 24
+; RV64I-NEXT:    or a7, t1, t0
+; RV64I-NEXT:    or a6, a7, a6
+; RV64I-NEXT:    slli a6, a6, 32
+; RV64I-NEXT:    or a5, a6, a5
 ; RV64I-NEXT:    lbu a6, 25(a0)
-; RV64I-NEXT:    lbu a5, 26(a0)
-; RV64I-NEXT:    lbu a4, 27(a0)
-; RV64I-NEXT:    lbu a1, 30(a0)
-; RV64I-NEXT:    lbu a3, 29(a0)
-; RV64I-NEXT:    lbu a0, 28(a0)
-; RV64I-NEXT:    lbu t0, 0(t0)
-; RV64I-NEXT:    sb a1, 86(sp)
-; RV64I-NEXT:    sb a3, 85(sp)
-; RV64I-NEXT:    sb a0, 84(sp)
-; RV64I-NEXT:    sb a4, 83(sp)
-; RV64I-NEXT:    sb a5, 82(sp)
-; RV64I-NEXT:    sb a6, 81(sp)
-; RV64I-NEXT:    sb t1, 87(sp)
-; RV64I-NEXT:    slli t1, t1, 56
-; RV64I-NEXT:    sb a7, 80(sp)
-; RV64I-NEXT:    sb ra, 79(sp)
-; RV64I-NEXT:    sb s11, 78(sp)
-; RV64I-NEXT:    sb s10, 77(sp)
-; RV64I-NEXT:    sb s9, 76(sp)
-; RV64I-NEXT:    sb s8, 75(sp)
-; RV64I-NEXT:    sb s7, 74(sp)
-; RV64I-NEXT:    sb s6, 73(sp)
-; RV64I-NEXT:    sb s5, 72(sp)
-; RV64I-NEXT:    sb s4, 71(sp)
-; RV64I-NEXT:    sb s3, 70(sp)
-; RV64I-NEXT:    sb s2, 69(sp)
-; RV64I-NEXT:    sb s1, 68(sp)
-; RV64I-NEXT:    sb s0, 67(sp)
-; RV64I-NEXT:    sb t6, 66(sp)
-; RV64I-NEXT:    sb t5, 65(sp)
-; RV64I-NEXT:    sb t4, 64(sp)
-; RV64I-NEXT:    sb t3, 63(sp)
-; RV64I-NEXT:    sb t2, 62(sp)
-; RV64I-NEXT:    ld a0, 8(sp) # 8-byte Folded Reload
-; RV64I-NEXT:    sb a0, 61(sp)
-; RV64I-NEXT:    ld a0, 16(sp) # 8-byte Folded Reload
-; RV64I-NEXT:    sb a0, 60(sp)
-; RV64I-NEXT:    ld a0, 24(sp) # 8-byte Folded Reload
-; RV64I-NEXT:    sb a0, 59(sp)
-; RV64I-NEXT:    ld a0, 32(sp) # 8-byte Folded Reload
-; RV64I-NEXT:    sb a0, 58(sp)
-; RV64I-NEXT:    ld a0, 40(sp) # 8-byte Folded Reload
-; RV64I-NEXT:    sb a0, 57(sp)
-; RV64I-NEXT:    ld a0, 48(sp) # 8-byte Folded Reload
-; RV64I-NEXT:    sb a0, 56(sp)
-; RV64I-NEXT:    srai a0, t1, 63
-; RV64I-NEXT:    sb a0, 112(sp)
-; RV64I-NEXT:    sb a0, 104(sp)
-; RV64I-NEXT:    sb a0, 96(sp)
-; RV64I-NEXT:    sb a0, 88(sp)
+; RV64I-NEXT:    lbu a7, 24(a0)
+; RV64I-NEXT:    lbu t0, 26(a0)
+; RV64I-NEXT:    lbu t1, 27(a0)
+; RV64I-NEXT:    slli a6, a6, 8
+; RV64I-NEXT:    or a6, a6, a7
+; RV64I-NEXT:    slli t0, t0, 16
+; RV64I-NEXT:    slli t1, t1, 24
+; RV64I-NEXT:    or a7, t1, t0
+; RV64I-NEXT:    or a6, a7, a6
+; RV64I-NEXT:    lbu a7, 29(a0)
+; RV64I-NEXT:    lbu t0, 28(a0)
+; RV64I-NEXT:    lbu t1, 30(a0)
+; RV64I-NEXT:    lbu a0, 31(a0)
+; RV64I-NEXT:    slli a7, a7, 8
+; RV64I-NEXT:    or a7, a7, t0
+; RV64I-NEXT:    slli t1, t1, 16
+; RV64I-NEXT:    slli a0, a0, 24
+; RV64I-NEXT:    or a0, a0, t1
+; RV64I-NEXT:    or a0, a0, a7
+; RV64I-NEXT:    slli a7, a0, 32
+; RV64I-NEXT:    or a6, a7, a6
+; RV64I-NEXT:    lbu a1, 0(a1)
+; RV64I-NEXT:    sraiw a0, a0, 31
+; RV64I-NEXT:    sd a0, 56(sp)
+; RV64I-NEXT:    sd a0, 48(sp)
+; RV64I-NEXT:    sd a0, 40(sp)
+; RV64I-NEXT:    sd a0, 32(sp)
+; RV64I-NEXT:    sd a6, 24(sp)
+; RV64I-NEXT:    sd a5, 16(sp)
+; RV64I-NEXT:    sd a4, 8(sp)
+; RV64I-NEXT:    sd a3, 0(sp)
+; RV64I-NEXT:    slli a1, a1, 3
+; RV64I-NEXT:    andi a1, a1, 24
+; RV64I-NEXT:    mv a0, sp
+; RV64I-NEXT:    add a0, a0, a1
+; RV64I-NEXT:    ld a1, 16(a0)
+; RV64I-NEXT:    ld a3, 24(a0)
+; RV64I-NEXT:    ld a4, 0(a0)
+; RV64I-NEXT:    ld a0, 8(a0)
+; RV64I-NEXT:    sb a1, 16(a2)
+; RV64I-NEXT:    sb a3, 24(a2)
+; RV64I-NEXT:    sb a4, 0(a2)
+; RV64I-NEXT:    sb a0, 8(a2)
+; RV64I-NEXT:    srli a5, a1, 56
+; RV64I-NEXT:    sb a5, 23(a2)
+; RV64I-NEXT:    srli a5, a1, 48
+; RV64I-NEXT:    sb a5, 22(a2)
+; RV64I-NEXT:    srli a5, a1, 40
+; RV64I-NEXT:    sb a5, 21(a2)
+; RV64I-NEXT:    srli a5, a1, 32
+; RV64I-NEXT:    sb a5, 20(a2)
+; RV64I-NEXT:    srli a5, a1, 24
+; RV64I-NEXT:    sb a5, 19(a2)
+; RV64I-NEXT:    srli a5, a1, 16
+; RV64I-NEXT:    sb a5, 18(a2)
+; RV64I-NEXT:    srli a1, a1, 8
+; RV64I-NEXT:    sb a1, 17(a2)
+; RV64I-NEXT:    srli a1, a3, 56
+; RV64I-NEXT:    sb a1, 31(a2)
+; RV64I-NEXT:    srli a1, a3, 48
+; RV64I-NEXT:    sb a1, 30(a2)
+; RV64I-NEXT:    srli a1, a3, 40
+; RV64I-NEXT:    sb a1, 29(a2)
+; RV64I-NEXT:    srli a1, a3, 32
+; RV64I-NEXT:    sb a1, 28(a2)
+; RV64I-NEXT:    srli a1, a3, 24
+; RV64I-NEXT:    sb a1, 27(a2)
+; RV64I-NEXT:    srli a1, a3, 16
+; RV64I-NEXT:    sb a1, 26(a2)
+; RV64I-NEXT:    srli a3, a3, 8
+; RV64I-NEXT:    sb a3, 25(a2)
+; RV64I-NEXT:    srli a1, a4, 56
+; RV64I-NEXT:    sb a1, 7(a2)
+; RV64I-NEXT:    srli a1, a4, 48
+; RV64I-NEXT:    sb a1, 6(a2)
+; RV64I-NEXT:    srli a1, a4, 40
+; RV64I-NEXT:    sb a1, 5(a2)
+; RV64I-NEXT:    srli a1, a4, 32
+; RV64I-NEXT:    sb a1, 4(a2)
+; RV64I-NEXT:    srli a1, a4, 24
+; RV64I-NEXT:    sb a1, 3(a2)
+; RV64I-NEXT:    srli a1, a4, 16
+; RV64I-NEXT:    sb a1, 2(a2)
+; RV64I-NEXT:    srli a4, a4, 8
+; RV64I-NEXT:    sb a4, 1(a2)
 ; RV64I-NEXT:    srli a1, a0, 56
-; RV64I-NEXT:    sb a1, 119(sp)
-; RV64I-NEXT:    srli a3, a0, 48
-; RV64I-NEXT:    sb a3, 118(sp)
-; RV64I-NEXT:    srli a4, a0, 40
-; RV64I-NEXT:    sb a4, 117(sp)
-; RV64I-NEXT:    srli a5, a0, 32
-; RV64I-NEXT:    sb a5, 116(sp)
-; RV64I-NEXT:    srli a6, a0, 24
-; RV64I-NEXT:    sb a6, 115(sp)
-; RV64I-NEXT:    srli a7, a0, 16
-; RV64I-NEXT:    sb a7, 114(sp)
+; RV64I-NEXT:    sb a1, 15(a2)
+; RV64I-NEXT:    srli a1, a0, 48
+; RV64I-NEXT:    sb a1, 14(a2)
+; RV64I-NEXT:    srli a1, a0, 40
+; RV64I-NEXT:    sb a1, 13(a2)
+; RV64I-NEXT:    srli a1, a0, 32
+; RV64I-NEXT:    sb a1, 12(a2)
+; RV64I-NEXT:    srli a1, a0, 24
+; RV64I-NEXT:    sb a1, 11(a2)
+; RV64I-NEXT:    srli a1, a0, 16
+; RV64I-NEXT:    sb a1, 10(a2)
 ; RV64I-NEXT:    srli a0, a0, 8
-; RV64I-NEXT:    sb a0, 113(sp)
-; RV64I-NEXT:    sb a1, 111(sp)
-; RV64I-NEXT:    sb a3, 110(sp)
-; RV64I-NEXT:    sb a4, 109(sp)
-; RV64I-NEXT:    sb a5, 108(sp)
-; RV64I-NEXT:    sb a6, 107(sp)
-; RV64I-NEXT:    sb a7, 106(sp)
-; RV64I-NEXT:    sb a0, 105(sp)
-; RV64I-NEXT:    sb a1, 103(sp)
-; RV64I-NEXT:    sb a3, 102(sp)
-; RV64I-NEXT:    sb a4, 101(sp)
-; RV64I-NEXT:    sb a5, 100(sp)
-; RV64I-NEXT:    sb a6, 99(sp)
-; RV64I-NEXT:    sb a7, 98(sp)
-; RV64I-NEXT:    sb a0, 97(sp)
-; RV64I-NEXT:    sb a1, 95(sp)
-; RV64I-NEXT:    sb a3, 94(sp)
-; RV64I-NEXT:    sb a4, 93(sp)
-; RV64I-NEXT:    sb a5, 92(sp)
-; RV64I-NEXT:    sb a6, 91(sp)
-; RV64I-NEXT:    sb a7, 90(sp)
-; RV64I-NEXT:    sb a0, 89(sp)
-; RV64I-NEXT:    andi a0, t0, 31
-; RV64I-NEXT:    addi a1, sp, 56
-; RV64I-NEXT:    add a6, a1, a0
-; RV64I-NEXT:    lbu a0, 8(a6)
-; RV64I-NEXT:    sd a0, 48(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    lbu a0, 9(a6)
-; RV64I-NEXT:    sd a0, 40(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    lbu a0, 10(a6)
-; RV64I-NEXT:    sd a0, 32(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    lbu a0, 11(a6)
-; RV64I-NEXT:    sd a0, 24(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    lbu a0, 12(a6)
-; RV64I-NEXT:    sd a0, 16(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    lbu a7, 13(a6)
-; RV64I-NEXT:    lbu t0, 14(a6)
-; RV64I-NEXT:    lbu t1, 15(a6)
-; RV64I-NEXT:    lbu t2, 0(a6)
-; RV64I-NEXT:    lbu t3, 1(a6)
-; RV64I-NEXT:    lbu t4, 2(a6)
-; RV64I-NEXT:    lbu t5, 3(a6)
-; RV64I-NEXT:    lbu t6, 4(a6)
-; RV64I-NEXT:    lbu s0, 5(a6)
-; RV64I-NEXT:    lbu s1, 6(a6)
-; RV64I-NEXT:    lbu s2, 7(a6)
-; RV64I-NEXT:    lbu s3, 24(a6)
-; RV64I-NEXT:    lbu s4, 25(a6)
-; RV64I-NEXT:    lbu s5, 26(a6)
-; RV64I-NEXT:    lbu s6, 27(a6)
-; RV64I-NEXT:    lbu s7, 28(a6)
-; RV64I-NEXT:    lbu s8, 29(a6)
-; RV64I-NEXT:    lbu s9, 30(a6)
-; RV64I-NEXT:    lbu s10, 31(a6)
-; RV64I-NEXT:    lbu s11, 16(a6)
-; RV64I-NEXT:    lbu ra, 17(a6)
-; RV64I-NEXT:    lbu a5, 18(a6)
-; RV64I-NEXT:    lbu a4, 19(a6)
-; RV64I-NEXT:    lbu a0, 23(a6)
-; RV64I-NEXT:    lbu a1, 22(a6)
-; RV64I-NEXT:    lbu a3, 21(a6)
-; RV64I-NEXT:    lbu a6, 20(a6)
-; RV64I-NEXT:    sb a0, 23(a2)
-; RV64I-NEXT:    sb a1, 22(a2)
-; RV64I-NEXT:    sb a3, 21(a2)
-; RV64I-NEXT:    sb a6, 20(a2)
-; RV64I-NEXT:    sb a4, 19(a2)
-; RV64I-NEXT:    sb a5, 18(a2)
-; RV64I-NEXT:    sb ra, 17(a2)
-; RV64I-NEXT:    sb s11, 16(a2)
-; RV64I-NEXT:    sb s10, 31(a2)
-; RV64I-NEXT:    sb s9, 30(a2)
-; RV64I-NEXT:    sb s8, 29(a2)
-; RV64I-NEXT:    sb s7, 28(a2)
-; RV64I-NEXT:    sb s6, 27(a2)
-; RV64I-NEXT:    sb s5, 26(a2)
-; RV64I-NEXT:    sb s4, 25(a2)
-; RV64I-NEXT:    sb s3, 24(a2)
-; RV64I-NEXT:    sb s2, 7(a2)
-; RV64I-NEXT:    sb s1, 6(a2)
-; RV64I-NEXT:    sb s0, 5(a2)
-; RV64I-NEXT:    sb t6, 4(a2)
-; RV64I-NEXT:    sb t5, 3(a2)
-; RV64I-NEXT:    sb t4, 2(a2)
-; RV64I-NEXT:    sb t3, 1(a2)
-; RV64I-NEXT:    sb t2, 0(a2)
-; RV64I-NEXT:    sb t1, 15(a2)
-; RV64I-NEXT:    sb t0, 14(a2)
-; RV64I-NEXT:    sb a7, 13(a2)
-; RV64I-NEXT:    ld a0, 16(sp) # 8-byte Folded Reload
-; RV64I-NEXT:    sb a0, 12(a2)
-; RV64I-NEXT:    ld a0, 24(sp) # 8-byte Folded Reload
-; RV64I-NEXT:    sb a0, 11(a2)
-; RV64I-NEXT:    ld a0, 32(sp) # 8-byte Folded Reload
-; RV64I-NEXT:    sb a0, 10(a2)
-; RV64I-NEXT:    ld a0, 40(sp) # 8-byte Folded Reload
 ; RV64I-NEXT:    sb a0, 9(a2)
-; RV64I-NEXT:    ld a0, 48(sp) # 8-byte Folded Reload
-; RV64I-NEXT:    sb a0, 8(a2)
-; RV64I-NEXT:    ld ra, 216(sp) # 8-byte Folded Reload
-; RV64I-NEXT:    ld s0, 208(sp) # 8-byte Folded Reload
-; RV64I-NEXT:    ld s1, 200(sp) # 8-byte Folded Reload
-; RV64I-NEXT:    ld s2, 192(sp) # 8-byte Folded Reload
-; RV64I-NEXT:    ld s3, 184(sp) # 8-byte Folded Reload
-; RV64I-NEXT:    ld s4, 176(sp) # 8-byte Folded Reload
-; RV64I-NEXT:    ld s5, 168(sp) # 8-byte Folded Reload
-; RV64I-NEXT:    ld s6, 160(sp) # 8-byte Folded Reload
-; RV64I-NEXT:    ld s7, 152(sp) # 8-byte Folded Reload
-; RV64I-NEXT:    ld s8, 144(sp) # 8-byte Folded Reload
-; RV64I-NEXT:    ld s9, 136(sp) # 8-byte Folded Reload
-; RV64I-NEXT:    ld s10, 128(sp) # 8-byte Folded Reload
-; RV64I-NEXT:    ld s11, 120(sp) # 8-byte Folded Reload
-; RV64I-NEXT:    addi sp, sp, 224
+; RV64I-NEXT:    addi sp, sp, 64
 ; RV64I-NEXT:    ret
 ;
-; RV32I-LABEL: ashr_32bytes:
+; RV32I-LABEL: ashr_32bytes_dwordOff:
 ; RV32I:       # %bb.0:
-; RV32I-NEXT:    addi sp, sp, -144
-; RV32I-NEXT:    sw ra, 140(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s0, 136(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s1, 132(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s2, 128(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s3, 124(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s4, 120(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s5, 116(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s6, 112(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s7, 108(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s8, 104(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s9, 100(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s10, 96(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s11, 92(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    mv t0, a1
-; RV32I-NEXT:    lbu t1, 31(a0)
-; RV32I-NEXT:    lbu a1, 0(a0)
-; RV32I-NEXT:    sw a1, 24(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    lbu a1, 1(a0)
-; RV32I-NEXT:    sw a1, 20(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    lbu a1, 2(a0)
-; RV32I-NEXT:    sw a1, 16(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    lbu a1, 3(a0)
-; RV32I-NEXT:    sw a1, 12(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    lbu a1, 4(a0)
-; RV32I-NEXT:    sw a1, 8(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    lbu a1, 5(a0)
-; RV32I-NEXT:    sw a1, 4(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    lbu t2, 6(a0)
-; RV32I-NEXT:    lbu t3, 7(a0)
-; RV32I-NEXT:    lbu t4, 8(a0)
-; RV32I-NEXT:    lbu t5, 9(a0)
-; RV32I-NEXT:    lbu t6, 10(a0)
-; RV32I-NEXT:    lbu s0, 11(a0)
-; RV32I-NEXT:    lbu s1, 12(a0)
-; RV32I-NEXT:    lbu s2, 13(a0)
-; RV32I-NEXT:    lbu s3, 14(a0)
-; RV32I-NEXT:    lbu s4, 15(a0)
-; RV32I-NEXT:    lbu s5, 16(a0)
-; RV32I-NEXT:    lbu s6, 17(a0)
-; RV32I-NEXT:    lbu s7, 18(a0)
-; RV32I-NEXT:    lbu s8, 19(a0)
-; RV32I-NEXT:    lbu s9, 20(a0)
-; RV32I-NEXT:    lbu s10, 21(a0)
-; RV32I-NEXT:    lbu s11, 22(a0)
-; RV32I-NEXT:    lbu ra, 23(a0)
-; RV32I-NEXT:    lbu a7, 24(a0)
-; RV32I-NEXT:    lbu a6, 25(a0)
-; RV32I-NEXT:    lbu a5, 26(a0)
-; RV32I-NEXT:    lbu a4, 27(a0)
-; RV32I-NEXT:    lbu a1, 30(a0)
-; RV32I-NEXT:    lbu a3, 29(a0)
-; RV32I-NEXT:    lbu a0, 28(a0)
-; RV32I-NEXT:    lbu t0, 0(t0)
-; RV32I-NEXT:    sb a1, 58(sp)
-; RV32I-NEXT:    sb a3, 57(sp)
-; RV32I-NEXT:    sb a0, 56(sp)
-; RV32I-NEXT:    sb a4, 55(sp)
-; RV32I-NEXT:    sb a5, 54(sp)
-; RV32I-NEXT:    sb a6, 53(sp)
-; RV32I-NEXT:    sb t1, 59(sp)
+; RV32I-NEXT:    addi sp, sp, -64
+; RV32I-NEXT:    lbu a3, 1(a0)
+; RV32I-NEXT:    lbu a4, 0(a0)
+; RV32I-NEXT:    lbu a5, 2(a0)
+; RV32I-NEXT:    lbu a6, 3(a0)
+; RV32I-NEXT:    slli a3, a3, 8
+; RV32I-NEXT:    or a3, a3, a4
+; RV32I-NEXT:    slli a5, a5, 16
+; RV32I-NEXT:    slli a6, a6, 24
+; RV32I-NEXT:    or a4, a6, a5
+; RV32I-NEXT:    or a3, a4, a3
+; RV32I-NEXT:    lbu a4, 5(a0)
+; RV32I-NEXT:    lbu a5, 4(a0)
+; RV32I-NEXT:    lbu a6, 6(a0)
+; RV32I-NEXT:    lbu a7, 7(a0)
+; RV32I-NEXT:    slli a4, a4, 8
+; RV32I-NEXT:    or a4, a4, a5
+; RV32I-NEXT:    slli a6, a6, 16
+; RV32I-NEXT:    slli a7, a7, 24
+; RV32I-NEXT:    or a5, a7, a6
+; RV32I-NEXT:    or a4, a5, a4
+; RV32I-NEXT:    lbu a5, 9(a0)
+; RV32I-NEXT:    lbu a6, 8(a0)
+; RV32I-NEXT:    lbu a7, 10(a0)
+; RV32I-NEXT:    lbu t0, 11(a0)
+; RV32I-NEXT:    slli a5, a5, 8
+; RV32I-NEXT:    or a5, a5, a6
+; RV32I-NEXT:    slli a7, a7, 16
+; RV32I-NEXT:    slli t0, t0, 24
+; RV32I-NEXT:    or a6, t0, a7
+; RV32I-NEXT:    or a5, a6, a5
+; RV32I-NEXT:    lbu a6, 13(a0)
+; RV32I-NEXT:    lbu a7, 12(a0)
+; RV32I-NEXT:    lbu t0, 14(a0)
+; RV32I-NEXT:    lbu t1, 15(a0)
+; RV32I-NEXT:    slli a6, a6, 8
+; RV32I-NEXT:    or a6, a6, a7
+; RV32I-NEXT:    slli t0, t0, 16
 ; RV32I-NEXT:    slli t1, t1, 24
-; RV32I-NEXT:    sb a7, 52(sp)
-; RV32I-NEXT:    sb ra, 51(sp)
-; RV32I-NEXT:    sb s11, 50(sp)
-; RV32I-NEXT:    sb s10, 49(sp)
-; RV32I-NEXT:    sb s9, 48(sp)
-; RV32I-NEXT:    sb s8, 47(sp)
-; RV32I-NEXT:    sb s7, 46(sp)
-; RV32I-NEXT:    sb s6, 45(sp)
-; RV32I-NEXT:    sb s5, 44(sp)
-; RV32I-NEXT:    sb s4, 43(sp)
-; RV32I-NEXT:    sb s3, 42(sp)
-; RV32I-NEXT:    sb s2, 41(sp)
-; RV32I-NEXT:    sb s1, 40(sp)
-; RV32I-NEXT:    sb s0, 39(sp)
-; RV32I-NEXT:    sb t6, 38(sp)
-; RV32I-NEXT:    sb t5, 37(sp)
-; RV32I-NEXT:    sb t4, 36(sp)
-; RV32I-NEXT:    sb t3, 35(sp)
-; RV32I-NEXT:    sb t2, 34(sp)
-; RV32I-NEXT:    lw a0, 4(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    sb a0, 33(sp)
-; RV32I-NEXT:    lw a0, 8(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    sb a0, 32(sp)
-; RV32I-NEXT:    lw a0, 12(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    sb a0, 31(sp)
-; RV32I-NEXT:    lw a0, 16(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    sb a0, 30(sp)
-; RV32I-NEXT:    lw a0, 20(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    sb a0, 29(sp)
-; RV32I-NEXT:    lw a0, 24(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    sb a0, 28(sp)
-; RV32I-NEXT:    srai a0, t1, 31
-; RV32I-NEXT:    sb a0, 88(sp)
-; RV32I-NEXT:    sb a0, 84(sp)
-; RV32I-NEXT:    sb a0, 80(sp)
-; RV32I-NEXT:    sb a0, 76(sp)
-; RV32I-NEXT:    sb a0, 72(sp)
-; RV32I-NEXT:    sb a0, 68(sp)
-; RV32I-NEXT:    sb a0, 64(sp)
-; RV32I-NEXT:    sb a0, 60(sp)
+; RV32I-NEXT:    or a7, t1, t0
+; RV32I-NEXT:    or a6, a7, a6
+; RV32I-NEXT:    lbu a7, 17(a0)
+; RV32I-NEXT:    lbu t0, 16(a0)
+; RV32I-NEXT:    lbu t1, 18(a0)
+; RV32I-NEXT:    lbu t2, 19(a0)
+; RV32I-NEXT:    slli a7, a7, 8
+; RV32I-NEXT:    or a7, a7, t0
+; RV32I-NEXT:    slli t1, t1, 16
+; RV32I-NEXT:    slli t2, t2, 24
+; RV32I-NEXT:    or t0, t2, t1
+; RV32I-NEXT:    or a7, t0, a7
+; RV32I-NEXT:    lbu t0, 21(a0)
+; RV32I-NEXT:    lbu t1, 20(a0)
+; RV32I-NEXT:    lbu t2, 22(a0)
+; RV32I-NEXT:    lbu t3, 23(a0)
+; RV32I-NEXT:    slli t0, t0, 8
+; RV32I-NEXT:    or t0, t0, t1
+; RV32I-NEXT:    slli t2, t2, 16
+; RV32I-NEXT:    slli t3, t3, 24
+; RV32I-NEXT:    or t1, t3, t2
+; RV32I-NEXT:    or t0, t1, t0
+; RV32I-NEXT:    lbu t1, 25(a0)
+; RV32I-NEXT:    lbu t2, 24(a0)
+; RV32I-NEXT:    lbu t3, 26(a0)
+; RV32I-NEXT:    lbu t4, 27(a0)
+; RV32I-NEXT:    slli t1, t1, 8
+; RV32I-NEXT:    or t1, t1, t2
+; RV32I-NEXT:    slli t3, t3, 16
+; RV32I-NEXT:    slli t4, t4, 24
+; RV32I-NEXT:    or t2, t4, t3
+; RV32I-NEXT:    or t1, t2, t1
+; RV32I-NEXT:    lbu t2, 29(a0)
+; RV32I-NEXT:    lbu t3, 28(a0)
+; RV32I-NEXT:    lbu t4, 30(a0)
+; RV32I-NEXT:    lbu a0, 31(a0)
+; RV32I-NEXT:    slli t2, t2, 8
+; RV32I-NEXT:    or t2, t2, t3
+; RV32I-NEXT:    slli t4, t4, 16
+; RV32I-NEXT:    slli a0, a0, 24
+; RV32I-NEXT:    or t3, a0, t4
+; RV32I-NEXT:    or t2, t3, t2
+; RV32I-NEXT:    lbu a1, 0(a1)
+; RV32I-NEXT:    srai a0, a0, 31
+; RV32I-NEXT:    sw a0, 60(sp)
+; RV32I-NEXT:    sw a0, 56(sp)
+; RV32I-NEXT:    sw a0, 52(sp)
+; RV32I-NEXT:    sw a0, 48(sp)
+; RV32I-NEXT:    sw a0, 44(sp)
+; RV32I-NEXT:    sw a0, 40(sp)
+; RV32I-NEXT:    sw a0, 36(sp)
+; RV32I-NEXT:    sw a0, 32(sp)
+; RV32I-NEXT:    sw t2, 28(sp)
+; RV32I-NEXT:    sw t1, 24(sp)
+; RV32I-NEXT:    sw t0, 20(sp)
+; RV32I-NEXT:    sw a7, 16(sp)
+; RV32I-NEXT:    sw a6, 12(sp)
+; RV32I-NEXT:    sw a5, 8(sp)
+; RV32I-NEXT:    sw a4, 4(sp)
+; RV32I-NEXT:    sw a3, 0(sp)
+; RV32I-NEXT:    slli a1, a1, 3
+; RV32I-NEXT:    andi a1, a1, 24
+; RV32I-NEXT:    mv a0, sp
+; RV32I-NEXT:    add a3, a0, a1
+; RV32I-NEXT:    lw a0, 4(a3)
+; RV32I-NEXT:    lw a1, 0(a3)
+; RV32I-NEXT:    lw a4, 12(a3)
+; RV32I-NEXT:    lw a5, 8(a3)
+; RV32I-NEXT:    lw a6, 24(a3)
+; RV32I-NEXT:    lw a7, 28(a3)
+; RV32I-NEXT:    lw t0, 16(a3)
+; RV32I-NEXT:    lw a3, 20(a3)
+; RV32I-NEXT:    sb a6, 24(a2)
+; RV32I-NEXT:    sb a7, 28(a2)
+; RV32I-NEXT:    sb t0, 16(a2)
+; RV32I-NEXT:    sb a3, 20(a2)
+; RV32I-NEXT:    sb a5, 8(a2)
+; RV32I-NEXT:    sb a4, 12(a2)
+; RV32I-NEXT:    sb a1, 0(a2)
+; RV32I-NEXT:    sb a0, 4(a2)
+; RV32I-NEXT:    srli t1, a6, 24
+; RV32I-NEXT:    sb t1, 27(a2)
+; RV32I-NEXT:    srli t1, a6, 16
+; RV32I-NEXT:    sb t1, 26(a2)
+; RV32I-NEXT:    srli a6, a6, 8
+; RV32I-NEXT:    sb a6, 25(a2)
+; RV32I-NEXT:    srli a6, a7, 24
+; RV32I-NEXT:    sb a6, 31(a2)
+; RV32I-NEXT:    srli a6, a7, 16
+; RV32I-NEXT:    sb a6, 30(a2)
+; RV32I-NEXT:    srli a6, a7, 8
+; RV32I-NEXT:    sb a6, 29(a2)
+; RV32I-NEXT:    srli a6, t0, 24
+; RV32I-NEXT:    sb a6, 19(a2)
+; RV32I-NEXT:    srli a6, t0, 16
+; RV32I-NEXT:    sb a6, 18(a2)
+; RV32I-NEXT:    srli a6, t0, 8
+; RV32I-NEXT:    sb a6, 17(a2)
+; RV32I-NEXT:    srli a6, a3, 24
+; RV32I-NEXT:    sb a6, 23(a2)
+; RV32I-NEXT:    srli a6, a3, 16
+; RV32I-NEXT:    sb a6, 22(a2)
+; RV32I-NEXT:    srli a3, a3, 8
+; RV32I-NEXT:    sb a3, 21(a2)
+; RV32I-NEXT:    srli a3, a5, 24
+; RV32I-NEXT:    sb a3, 11(a2)
+; RV32I-NEXT:    srli a3, a5, 16
+; RV32I-NEXT:    sb a3, 10(a2)
+; RV32I-NEXT:    srli a5, a5, 8
+; RV32I-NEXT:    sb a5, 9(a2)
+; RV32I-NEXT:    srli a3, a4, 24
+; RV32I-NEXT:    sb a3, 15(a2)
+; RV32I-NEXT:    srli a3, a4, 16
+; RV32I-NEXT:    sb a3, 14(a2)
+; RV32I-NEXT:    srli a4, a4, 8
+; RV32I-NEXT:    sb a4, 13(a2)
+; RV32I-NEXT:    srli a3, a1, 24
+; RV32I-NEXT:    sb a3, 3(a2)
+; RV32I-NEXT:    srli a3, a1, 16
+; RV32I-NEXT:    sb a3, 2(a2)
+; RV32I-NEXT:    srli a1, a1, 8
+; RV32I-NEXT:    sb a1, 1(a2)
 ; RV32I-NEXT:    srli a1, a0, 24
-; RV32I-NEXT:    sb a1, 91(sp)
-; RV32I-NEXT:    srli a3, a0, 16
-; RV32I-NEXT:    sb a3, 90(sp)
+; RV32I-NEXT:    sb a1, 7(a2)
+; RV32I-NEXT:    srli a1, a0, 16
+; RV32I-NEXT:    sb a1, 6(a2)
 ; RV32I-NEXT:    srli a0, a0, 8
-; RV32I-NEXT:    sb a0, 89(sp)
-; RV32I-NEXT:    sb a1, 87(sp)
-; RV32I-NEXT:    sb a3, 86(sp)
-; RV32I-NEXT:    sb a0, 85(sp)
-; RV32I-NEXT:    sb a1, 83(sp)
-; RV32I-NEXT:    sb a3, 82(sp)
-; RV32I-NEXT:    sb a0, 81(sp)
-; RV32I-NEXT:    sb a1, 79(sp)
-; RV32I-NEXT:    sb a3, 78(sp)
-; RV32I-NEXT:    sb a0, 77(sp)
-; RV32I-NEXT:    sb a1, 75(sp)
-; RV32I-NEXT:    sb a3, 74(sp)
-; RV32I-NEXT:    sb a0, 73(sp)
-; RV32I-NEXT:    sb a1, 71(sp)
-; RV32I-NEXT:    sb a3, 70(sp)
-; RV32I-NEXT:    sb a0, 69(sp)
-; RV32I-NEXT:    sb a1, 67(sp)
-; RV32I-NEXT:    sb a3, 66(sp)
-; RV32I-NEXT:    sb a0, 65(sp)
-; RV32I-NEXT:    sb a1, 63(sp)
-; RV32I-NEXT:    sb a3, 62(sp)
-; RV32I-NEXT:    sb a0, 61(sp)
-; RV32I-NEXT:    andi a0, t0, 31
-; RV32I-NEXT:    addi a1, sp, 28
-; RV32I-NEXT:    add a6, a1, a0
-; RV32I-NEXT:    lbu a0, 6(a6)
-; RV32I-NEXT:    sw a0, 24(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    lbu a0, 7(a6)
-; RV32I-NEXT:    sw a0, 20(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    lbu a0, 4(a6)
-; RV32I-NEXT:    sw a0, 16(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    lbu a0, 5(a6)
-; RV32I-NEXT:    sw a0, 12(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    lbu a0, 0(a6)
-; RV32I-NEXT:    sw a0, 8(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    lbu a7, 1(a6)
-; RV32I-NEXT:    lbu t0, 2(a6)
-; RV32I-NEXT:    lbu t1, 3(a6)
-; RV32I-NEXT:    lbu t2, 14(a6)
-; RV32I-NEXT:    lbu t3, 15(a6)
-; RV32I-NEXT:    lbu t4, 12(a6)
-; RV32I-NEXT:    lbu t5, 13(a6)
-; RV32I-NEXT:    lbu t6, 10(a6)
-; RV32I-NEXT:    lbu s0, 11(a6)
-; RV32I-NEXT:    lbu s1, 8(a6)
-; RV32I-NEXT:    lbu s2, 9(a6)
-; RV32I-NEXT:    lbu s3, 22(a6)
-; RV32I-NEXT:    lbu s4, 23(a6)
-; RV32I-NEXT:    lbu s5, 20(a6)
-; RV32I-NEXT:    lbu s6, 21(a6)
-; RV32I-NEXT:    lbu s7, 18(a6)
-; RV32I-NEXT:    lbu s8, 19(a6)
-; RV32I-NEXT:    lbu s9, 16(a6)
-; RV32I-NEXT:    lbu s10, 17(a6)
-; RV32I-NEXT:    lbu s11, 30(a6)
-; RV32I-NEXT:    lbu ra, 31(a6)
-; RV32I-NEXT:    lbu a5, 28(a6)
-; RV32I-NEXT:    lbu a4, 29(a6)
-; RV32I-NEXT:    lbu a0, 25(a6)
-; RV32I-NEXT:    lbu a1, 24(a6)
-; RV32I-NEXT:    lbu a3, 27(a6)
-; RV32I-NEXT:    lbu a6, 26(a6)
-; RV32I-NEXT:    sb a0, 25(a2)
-; RV32I-NEXT:    sb a1, 24(a2)
-; RV32I-NEXT:    sb a3, 27(a2)
-; RV32I-NEXT:    sb a6, 26(a2)
-; RV32I-NEXT:    sb a4, 29(a2)
-; RV32I-NEXT:    sb a5, 28(a2)
-; RV32I-NEXT:    sb ra, 31(a2)
-; RV32I-NEXT:    sb s11, 30(a2)
-; RV32I-NEXT:    sb s10, 17(a2)
-; RV32I-NEXT:    sb s9, 16(a2)
-; RV32I-NEXT:    sb s8, 19(a2)
-; RV32I-NEXT:    sb s7, 18(a2)
-; RV32I-NEXT:    sb s6, 21(a2)
-; RV32I-NEXT:    sb s5, 20(a2)
-; RV32I-NEXT:    sb s4, 23(a2)
-; RV32I-NEXT:    sb s3, 22(a2)
-; RV32I-NEXT:    sb s2, 9(a2)
-; RV32I-NEXT:    sb s1, 8(a2)
-; RV32I-NEXT:    sb s0, 11(a2)
-; RV32I-NEXT:    sb t6, 10(a2)
-; RV32I-NEXT:    sb t5, 13(a2)
-; RV32I-NEXT:    sb t4, 12(a2)
-; RV32I-NEXT:    sb t3, 15(a2)
-; RV32I-NEXT:    sb t2, 14(a2)
-; RV32I-NEXT:    sb t1, 3(a2)
-; RV32I-NEXT:    sb t0, 2(a2)
-; RV32I-NEXT:    sb a7, 1(a2)
-; RV32I-NEXT:    lw a0, 8(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    sb a0, 0(a2)
-; RV32I-NEXT:    lw a0, 12(sp) # 4-byte Folded Reload
 ; RV32I-NEXT:    sb a0, 5(a2)
-; RV32I-NEXT:    lw a0, 16(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    sb a0, 4(a2)
-; RV32I-NEXT:    lw a0, 20(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    sb a0, 7(a2)
-; RV32I-NEXT:    lw a0, 24(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    sb a0, 6(a2)
-; RV32I-NEXT:    lw ra, 140(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s0, 136(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s1, 132(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s2, 128(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s3, 124(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s4, 120(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s5, 116(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s6, 112(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s7, 108(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s8, 104(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s9, 100(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s10, 96(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s11, 92(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    addi sp, sp, 144
+; RV32I-NEXT:    addi sp, sp, 64
 ; RV32I-NEXT:    ret
   %src = load i256, ptr %src.ptr, align 1
-  %byteOff = load i256, ptr %byteOff.ptr, align 1
-  %bitOff = shl i256 %byteOff, 3
+  %dwordOff = load i256, ptr %dwordOff.ptr, align 1
+  %bitOff = shl i256 %dwordOff, 6
   %res = ashr i256 %src, %bitOff
   store i256 %res, ptr %dst, align 1
   ret void

diff  --git a/llvm/test/CodeGen/RISCV/wide-scalar-shift-legalization.ll b/llvm/test/CodeGen/RISCV/wide-scalar-shift-legalization.ll
index a601256bc2afaa..7e879b137b4f0d 100644
--- a/llvm/test/CodeGen/RISCV/wide-scalar-shift-legalization.ll
+++ b/llvm/test/CodeGen/RISCV/wide-scalar-shift-legalization.ll
@@ -704,164 +704,117 @@ define void @lshr_16bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ;
 ; RV32I-LABEL: lshr_16bytes:
 ; RV32I:       # %bb.0:
-; RV32I-NEXT:    addi sp, sp, -64
-; RV32I-NEXT:    sw s0, 60(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s1, 56(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s2, 52(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s3, 48(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s4, 44(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    lbu a3, 0(a0)
-; RV32I-NEXT:    lbu a4, 1(a0)
+; RV32I-NEXT:    addi sp, sp, -32
+; RV32I-NEXT:    lbu a3, 1(a0)
+; RV32I-NEXT:    lbu a4, 0(a0)
 ; RV32I-NEXT:    lbu a5, 2(a0)
 ; RV32I-NEXT:    lbu a6, 3(a0)
-; RV32I-NEXT:    lbu a7, 4(a0)
-; RV32I-NEXT:    lbu t0, 5(a0)
-; RV32I-NEXT:    lbu t1, 6(a0)
-; RV32I-NEXT:    lbu t2, 7(a0)
-; RV32I-NEXT:    lbu t3, 8(a0)
-; RV32I-NEXT:    lbu t4, 9(a0)
-; RV32I-NEXT:    lbu t5, 10(a0)
-; RV32I-NEXT:    lbu t6, 11(a0)
-; RV32I-NEXT:    lbu s0, 1(a1)
-; RV32I-NEXT:    lbu s1, 0(a1)
-; RV32I-NEXT:    lbu s2, 12(a0)
-; RV32I-NEXT:    lbu s3, 13(a0)
-; RV32I-NEXT:    slli s0, s0, 8
-; RV32I-NEXT:    or s0, s0, s1
-; RV32I-NEXT:    lbu s1, 2(a1)
-; RV32I-NEXT:    lbu a1, 3(a1)
-; RV32I-NEXT:    lbu s4, 14(a0)
-; RV32I-NEXT:    lbu a0, 15(a0)
-; RV32I-NEXT:    slli s1, s1, 16
-; RV32I-NEXT:    slli a1, a1, 24
-; RV32I-NEXT:    or a1, a1, s1
-; RV32I-NEXT:    or a1, a1, s0
-; RV32I-NEXT:    sb zero, 43(sp)
-; RV32I-NEXT:    sb zero, 42(sp)
-; RV32I-NEXT:    sb zero, 41(sp)
-; RV32I-NEXT:    sb zero, 40(sp)
-; RV32I-NEXT:    sb zero, 39(sp)
-; RV32I-NEXT:    sb zero, 38(sp)
-; RV32I-NEXT:    sb zero, 37(sp)
-; RV32I-NEXT:    sb zero, 36(sp)
-; RV32I-NEXT:    sb zero, 35(sp)
-; RV32I-NEXT:    sb zero, 34(sp)
-; RV32I-NEXT:    sb zero, 33(sp)
-; RV32I-NEXT:    sb zero, 32(sp)
-; RV32I-NEXT:    sb zero, 31(sp)
-; RV32I-NEXT:    sb zero, 30(sp)
-; RV32I-NEXT:    sb zero, 29(sp)
-; RV32I-NEXT:    sb zero, 28(sp)
-; RV32I-NEXT:    sb a0, 27(sp)
-; RV32I-NEXT:    sb s4, 26(sp)
-; RV32I-NEXT:    sb s3, 25(sp)
-; RV32I-NEXT:    sb s2, 24(sp)
-; RV32I-NEXT:    sb t6, 23(sp)
-; RV32I-NEXT:    sb t5, 22(sp)
-; RV32I-NEXT:    sb t4, 21(sp)
-; RV32I-NEXT:    sb t3, 20(sp)
-; RV32I-NEXT:    sb t2, 19(sp)
-; RV32I-NEXT:    sb t1, 18(sp)
-; RV32I-NEXT:    sb t0, 17(sp)
-; RV32I-NEXT:    sb a7, 16(sp)
-; RV32I-NEXT:    sb a6, 15(sp)
-; RV32I-NEXT:    sb a5, 14(sp)
-; RV32I-NEXT:    sb a4, 13(sp)
-; RV32I-NEXT:    sb a3, 12(sp)
-; RV32I-NEXT:    slli a0, a1, 25
-; RV32I-NEXT:    srli a0, a0, 28
-; RV32I-NEXT:    addi a3, sp, 12
-; RV32I-NEXT:    add a3, a3, a0
-; RV32I-NEXT:    lbu a0, 5(a3)
-; RV32I-NEXT:    lbu a4, 4(a3)
-; RV32I-NEXT:    lbu a5, 6(a3)
-; RV32I-NEXT:    lbu a6, 7(a3)
-; RV32I-NEXT:    slli a0, a0, 8
-; RV32I-NEXT:    or a0, a0, a4
+; RV32I-NEXT:    slli a3, a3, 8
+; RV32I-NEXT:    or a3, a3, a4
 ; RV32I-NEXT:    slli a5, a5, 16
 ; RV32I-NEXT:    slli a6, a6, 24
 ; RV32I-NEXT:    or a4, a6, a5
-; RV32I-NEXT:    or a5, a4, a0
-; RV32I-NEXT:    andi a4, a1, 7
-; RV32I-NEXT:    srl a0, a5, a4
-; RV32I-NEXT:    lbu a1, 9(a3)
-; RV32I-NEXT:    lbu a6, 8(a3)
-; RV32I-NEXT:    lbu a7, 10(a3)
-; RV32I-NEXT:    lbu t0, 11(a3)
-; RV32I-NEXT:    slli a1, a1, 8
-; RV32I-NEXT:    or a1, a1, a6
+; RV32I-NEXT:    or a3, a4, a3
+; RV32I-NEXT:    lbu a4, 5(a0)
+; RV32I-NEXT:    lbu a5, 4(a0)
+; RV32I-NEXT:    lbu a6, 6(a0)
+; RV32I-NEXT:    lbu a7, 7(a0)
+; RV32I-NEXT:    slli a4, a4, 8
+; RV32I-NEXT:    or a4, a4, a5
+; RV32I-NEXT:    slli a6, a6, 16
+; RV32I-NEXT:    slli a7, a7, 24
+; RV32I-NEXT:    or a5, a7, a6
+; RV32I-NEXT:    or a4, a5, a4
+; RV32I-NEXT:    lbu a5, 9(a0)
+; RV32I-NEXT:    lbu a6, 8(a0)
+; RV32I-NEXT:    lbu a7, 10(a0)
+; RV32I-NEXT:    lbu t0, 11(a0)
+; RV32I-NEXT:    slli a5, a5, 8
+; RV32I-NEXT:    or a5, a5, a6
 ; RV32I-NEXT:    slli a7, a7, 16
 ; RV32I-NEXT:    slli t0, t0, 24
 ; RV32I-NEXT:    or a6, t0, a7
-; RV32I-NEXT:    or a6, a6, a1
-; RV32I-NEXT:    slli a1, a6, 1
-; RV32I-NEXT:    not a7, a4
-; RV32I-NEXT:    sll a1, a1, a7
-; RV32I-NEXT:    or a1, a0, a1
-; RV32I-NEXT:    lbu a7, 1(a3)
-; RV32I-NEXT:    lbu t0, 0(a3)
-; RV32I-NEXT:    lbu t1, 2(a3)
-; RV32I-NEXT:    lbu t2, 3(a3)
-; RV32I-NEXT:    slli a7, a7, 8
-; RV32I-NEXT:    or a7, a7, t0
-; RV32I-NEXT:    slli t1, t1, 16
-; RV32I-NEXT:    slli t2, t2, 24
-; RV32I-NEXT:    or t0, t2, t1
-; RV32I-NEXT:    or a7, t0, a7
-; RV32I-NEXT:    srl a7, a7, a4
-; RV32I-NEXT:    slli a5, a5, 1
-; RV32I-NEXT:    xori t0, a4, 31
-; RV32I-NEXT:    sll a5, a5, t0
-; RV32I-NEXT:    or a5, a7, a5
-; RV32I-NEXT:    srl a6, a6, a4
-; RV32I-NEXT:    lbu t1, 13(a3)
-; RV32I-NEXT:    lbu t2, 12(a3)
-; RV32I-NEXT:    lbu t3, 14(a3)
-; RV32I-NEXT:    lbu a3, 15(a3)
-; RV32I-NEXT:    slli t1, t1, 8
-; RV32I-NEXT:    or t1, t1, t2
-; RV32I-NEXT:    slli t3, t3, 16
-; RV32I-NEXT:    slli a3, a3, 24
-; RV32I-NEXT:    or a3, a3, t3
-; RV32I-NEXT:    or a3, a3, t1
-; RV32I-NEXT:    slli t1, a3, 1
-; RV32I-NEXT:    sll t0, t1, t0
-; RV32I-NEXT:    or t0, a6, t0
-; RV32I-NEXT:    srl a3, a3, a4
-; RV32I-NEXT:    sb a6, 8(a2)
-; RV32I-NEXT:    sb a3, 12(a2)
-; RV32I-NEXT:    sb a7, 0(a2)
-; RV32I-NEXT:    sb a0, 4(a2)
-; RV32I-NEXT:    srli a4, a6, 16
-; RV32I-NEXT:    sb a4, 10(a2)
-; RV32I-NEXT:    srli a4, a6, 8
-; RV32I-NEXT:    sb a4, 9(a2)
-; RV32I-NEXT:    srli a4, a3, 16
-; RV32I-NEXT:    sb a4, 14(a2)
-; RV32I-NEXT:    srli a4, a3, 24
-; RV32I-NEXT:    sb a4, 15(a2)
-; RV32I-NEXT:    srli a3, a3, 8
-; RV32I-NEXT:    sb a3, 13(a2)
-; RV32I-NEXT:    srli a3, a7, 16
-; RV32I-NEXT:    sb a3, 2(a2)
-; RV32I-NEXT:    srli a3, a7, 8
-; RV32I-NEXT:    sb a3, 1(a2)
-; RV32I-NEXT:    srli a3, a0, 16
-; RV32I-NEXT:    sb a3, 6(a2)
+; RV32I-NEXT:    or a5, a6, a5
+; RV32I-NEXT:    lbu a6, 13(a0)
+; RV32I-NEXT:    lbu a7, 12(a0)
+; RV32I-NEXT:    lbu t0, 14(a0)
+; RV32I-NEXT:    lbu a0, 15(a0)
+; RV32I-NEXT:    slli a6, a6, 8
+; RV32I-NEXT:    or a6, a6, a7
+; RV32I-NEXT:    slli t0, t0, 16
+; RV32I-NEXT:    slli a0, a0, 24
+; RV32I-NEXT:    or a0, a0, t0
+; RV32I-NEXT:    or a0, a0, a6
+; RV32I-NEXT:    lbu a6, 1(a1)
+; RV32I-NEXT:    lbu a7, 0(a1)
+; RV32I-NEXT:    lbu t0, 2(a1)
+; RV32I-NEXT:    lbu a1, 3(a1)
+; RV32I-NEXT:    slli a6, a6, 8
+; RV32I-NEXT:    or a6, a6, a7
+; RV32I-NEXT:    slli t0, t0, 16
+; RV32I-NEXT:    slli a1, a1, 24
+; RV32I-NEXT:    or a1, a1, t0
+; RV32I-NEXT:    or a1, a1, a6
+; RV32I-NEXT:    sw zero, 28(sp)
+; RV32I-NEXT:    sw zero, 24(sp)
+; RV32I-NEXT:    sw zero, 20(sp)
+; RV32I-NEXT:    sw zero, 16(sp)
+; RV32I-NEXT:    sw a0, 12(sp)
+; RV32I-NEXT:    sw a5, 8(sp)
+; RV32I-NEXT:    sw a4, 4(sp)
+; RV32I-NEXT:    sw a3, 0(sp)
+; RV32I-NEXT:    srli a0, a1, 3
+; RV32I-NEXT:    andi a0, a0, 12
+; RV32I-NEXT:    mv a3, sp
+; RV32I-NEXT:    add a0, a3, a0
+; RV32I-NEXT:    lw a3, 4(a0)
+; RV32I-NEXT:    srl a4, a3, a1
+; RV32I-NEXT:    lw a5, 8(a0)
+; RV32I-NEXT:    andi a6, a1, 31
+; RV32I-NEXT:    xori a6, a6, 31
+; RV32I-NEXT:    lw a7, 0(a0)
+; RV32I-NEXT:    slli t0, a5, 1
+; RV32I-NEXT:    sll t0, t0, a6
+; RV32I-NEXT:    or a4, a4, t0
+; RV32I-NEXT:    srl a7, a7, a1
+; RV32I-NEXT:    slli a3, a3, 1
+; RV32I-NEXT:    lw a0, 12(a0)
+; RV32I-NEXT:    sll a3, a3, a6
+; RV32I-NEXT:    or a3, a7, a3
+; RV32I-NEXT:    srl a5, a5, a1
+; RV32I-NEXT:    slli a7, a0, 1
+; RV32I-NEXT:    sll a6, a7, a6
+; RV32I-NEXT:    or a5, a5, a6
+; RV32I-NEXT:    srl a0, a0, a1
+; RV32I-NEXT:    sb a0, 12(a2)
+; RV32I-NEXT:    srli a1, a0, 16
+; RV32I-NEXT:    sb a1, 14(a2)
+; RV32I-NEXT:    srli a1, a0, 24
+; RV32I-NEXT:    sb a1, 15(a2)
 ; RV32I-NEXT:    srli a0, a0, 8
-; RV32I-NEXT:    sb a0, 5(a2)
-; RV32I-NEXT:    srli a0, t0, 24
+; RV32I-NEXT:    sb a0, 13(a2)
+; RV32I-NEXT:    sb a5, 8(a2)
+; RV32I-NEXT:    sb a3, 0(a2)
+; RV32I-NEXT:    sb a4, 4(a2)
+; RV32I-NEXT:    srli a0, a5, 16
+; RV32I-NEXT:    sb a0, 10(a2)
+; RV32I-NEXT:    srli a0, a5, 24
 ; RV32I-NEXT:    sb a0, 11(a2)
-; RV32I-NEXT:    srli a5, a5, 24
-; RV32I-NEXT:    sb a5, 3(a2)
-; RV32I-NEXT:    srli a1, a1, 24
-; RV32I-NEXT:    sb a1, 7(a2)
-; RV32I-NEXT:    lw s0, 60(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s1, 56(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s2, 52(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s3, 48(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s4, 44(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    addi sp, sp, 64
+; RV32I-NEXT:    srli a5, a5, 8
+; RV32I-NEXT:    sb a5, 9(a2)
+; RV32I-NEXT:    srli a0, a3, 16
+; RV32I-NEXT:    sb a0, 2(a2)
+; RV32I-NEXT:    srli a0, a3, 24
+; RV32I-NEXT:    sb a0, 3(a2)
+; RV32I-NEXT:    srli a3, a3, 8
+; RV32I-NEXT:    sb a3, 1(a2)
+; RV32I-NEXT:    srli a0, a4, 16
+; RV32I-NEXT:    sb a0, 6(a2)
+; RV32I-NEXT:    srli a0, a4, 24
+; RV32I-NEXT:    sb a0, 7(a2)
+; RV32I-NEXT:    srli a4, a4, 8
+; RV32I-NEXT:    sb a4, 5(a2)
+; RV32I-NEXT:    addi sp, sp, 32
 ; RV32I-NEXT:    ret
   %src = load i128, ptr %src.ptr, align 1
   %bitOff = load i128, ptr %bitOff.ptr, align 1
@@ -987,164 +940,117 @@ define void @shl_16bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ;
 ; RV32I-LABEL: shl_16bytes:
 ; RV32I:       # %bb.0:
-; RV32I-NEXT:    addi sp, sp, -64
-; RV32I-NEXT:    sw s0, 60(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s1, 56(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s2, 52(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s3, 48(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s4, 44(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    lbu a3, 0(a0)
-; RV32I-NEXT:    lbu a4, 1(a0)
+; RV32I-NEXT:    addi sp, sp, -32
+; RV32I-NEXT:    lbu a3, 1(a0)
+; RV32I-NEXT:    lbu a4, 0(a0)
 ; RV32I-NEXT:    lbu a5, 2(a0)
 ; RV32I-NEXT:    lbu a6, 3(a0)
-; RV32I-NEXT:    lbu a7, 4(a0)
-; RV32I-NEXT:    lbu t0, 5(a0)
-; RV32I-NEXT:    lbu t1, 6(a0)
-; RV32I-NEXT:    lbu t2, 7(a0)
-; RV32I-NEXT:    lbu t3, 8(a0)
-; RV32I-NEXT:    lbu t4, 9(a0)
-; RV32I-NEXT:    lbu t5, 10(a0)
-; RV32I-NEXT:    lbu t6, 11(a0)
-; RV32I-NEXT:    lbu s0, 1(a1)
-; RV32I-NEXT:    lbu s1, 0(a1)
-; RV32I-NEXT:    lbu s2, 12(a0)
-; RV32I-NEXT:    lbu s3, 13(a0)
-; RV32I-NEXT:    slli s0, s0, 8
-; RV32I-NEXT:    or s0, s0, s1
-; RV32I-NEXT:    lbu s1, 2(a1)
-; RV32I-NEXT:    lbu a1, 3(a1)
-; RV32I-NEXT:    lbu s4, 14(a0)
-; RV32I-NEXT:    lbu a0, 15(a0)
-; RV32I-NEXT:    slli s1, s1, 16
-; RV32I-NEXT:    slli a1, a1, 24
-; RV32I-NEXT:    or a1, a1, s1
-; RV32I-NEXT:    or a1, a1, s0
-; RV32I-NEXT:    sb zero, 27(sp)
-; RV32I-NEXT:    sb zero, 26(sp)
-; RV32I-NEXT:    sb zero, 25(sp)
-; RV32I-NEXT:    sb zero, 24(sp)
-; RV32I-NEXT:    sb zero, 23(sp)
-; RV32I-NEXT:    sb zero, 22(sp)
-; RV32I-NEXT:    sb zero, 21(sp)
-; RV32I-NEXT:    sb zero, 20(sp)
-; RV32I-NEXT:    sb zero, 19(sp)
-; RV32I-NEXT:    sb zero, 18(sp)
-; RV32I-NEXT:    sb zero, 17(sp)
-; RV32I-NEXT:    sb zero, 16(sp)
-; RV32I-NEXT:    sb zero, 15(sp)
-; RV32I-NEXT:    sb zero, 14(sp)
-; RV32I-NEXT:    sb zero, 13(sp)
-; RV32I-NEXT:    sb zero, 12(sp)
-; RV32I-NEXT:    sb a0, 43(sp)
-; RV32I-NEXT:    sb s4, 42(sp)
-; RV32I-NEXT:    sb s3, 41(sp)
-; RV32I-NEXT:    sb s2, 40(sp)
-; RV32I-NEXT:    sb t6, 39(sp)
-; RV32I-NEXT:    sb t5, 38(sp)
-; RV32I-NEXT:    sb t4, 37(sp)
-; RV32I-NEXT:    sb t3, 36(sp)
-; RV32I-NEXT:    sb t2, 35(sp)
-; RV32I-NEXT:    sb t1, 34(sp)
-; RV32I-NEXT:    sb t0, 33(sp)
-; RV32I-NEXT:    sb a7, 32(sp)
-; RV32I-NEXT:    sb a6, 31(sp)
-; RV32I-NEXT:    sb a5, 30(sp)
-; RV32I-NEXT:    sb a4, 29(sp)
-; RV32I-NEXT:    sb a3, 28(sp)
-; RV32I-NEXT:    slli a0, a1, 25
-; RV32I-NEXT:    srli a0, a0, 28
-; RV32I-NEXT:    addi a3, sp, 28
-; RV32I-NEXT:    sub a3, a3, a0
-; RV32I-NEXT:    lbu a0, 5(a3)
-; RV32I-NEXT:    lbu a4, 4(a3)
-; RV32I-NEXT:    lbu a5, 6(a3)
-; RV32I-NEXT:    lbu a6, 7(a3)
-; RV32I-NEXT:    slli a0, a0, 8
-; RV32I-NEXT:    or a0, a0, a4
+; RV32I-NEXT:    slli a3, a3, 8
+; RV32I-NEXT:    or a3, a3, a4
 ; RV32I-NEXT:    slli a5, a5, 16
 ; RV32I-NEXT:    slli a6, a6, 24
 ; RV32I-NEXT:    or a4, a6, a5
-; RV32I-NEXT:    or a5, a4, a0
-; RV32I-NEXT:    andi a4, a1, 7
-; RV32I-NEXT:    sll a0, a5, a4
-; RV32I-NEXT:    lbu a1, 1(a3)
-; RV32I-NEXT:    lbu a6, 0(a3)
-; RV32I-NEXT:    lbu a7, 2(a3)
-; RV32I-NEXT:    lbu t0, 3(a3)
-; RV32I-NEXT:    slli a1, a1, 8
-; RV32I-NEXT:    or a1, a1, a6
+; RV32I-NEXT:    or a3, a4, a3
+; RV32I-NEXT:    lbu a4, 5(a0)
+; RV32I-NEXT:    lbu a5, 4(a0)
+; RV32I-NEXT:    lbu a6, 6(a0)
+; RV32I-NEXT:    lbu a7, 7(a0)
+; RV32I-NEXT:    slli a4, a4, 8
+; RV32I-NEXT:    or a4, a4, a5
+; RV32I-NEXT:    slli a6, a6, 16
+; RV32I-NEXT:    slli a7, a7, 24
+; RV32I-NEXT:    or a5, a7, a6
+; RV32I-NEXT:    or a4, a5, a4
+; RV32I-NEXT:    lbu a5, 9(a0)
+; RV32I-NEXT:    lbu a6, 8(a0)
+; RV32I-NEXT:    lbu a7, 10(a0)
+; RV32I-NEXT:    lbu t0, 11(a0)
+; RV32I-NEXT:    slli a5, a5, 8
+; RV32I-NEXT:    or a5, a5, a6
 ; RV32I-NEXT:    slli a7, a7, 16
 ; RV32I-NEXT:    slli t0, t0, 24
 ; RV32I-NEXT:    or a6, t0, a7
-; RV32I-NEXT:    or a6, a6, a1
-; RV32I-NEXT:    srli a1, a6, 1
-; RV32I-NEXT:    xori a7, a4, 31
-; RV32I-NEXT:    srl a1, a1, a7
-; RV32I-NEXT:    or a1, a0, a1
-; RV32I-NEXT:    lbu t0, 13(a3)
-; RV32I-NEXT:    lbu t1, 12(a3)
-; RV32I-NEXT:    lbu t2, 14(a3)
-; RV32I-NEXT:    lbu t3, 15(a3)
-; RV32I-NEXT:    slli t0, t0, 8
-; RV32I-NEXT:    or t0, t0, t1
-; RV32I-NEXT:    slli t2, t2, 16
-; RV32I-NEXT:    slli t3, t3, 24
-; RV32I-NEXT:    or t1, t3, t2
-; RV32I-NEXT:    or t0, t1, t0
-; RV32I-NEXT:    sll t0, t0, a4
-; RV32I-NEXT:    lbu t1, 9(a3)
-; RV32I-NEXT:    lbu t2, 8(a3)
-; RV32I-NEXT:    lbu t3, 10(a3)
-; RV32I-NEXT:    lbu a3, 11(a3)
-; RV32I-NEXT:    slli t1, t1, 8
-; RV32I-NEXT:    or t1, t1, t2
-; RV32I-NEXT:    slli t3, t3, 16
-; RV32I-NEXT:    slli a3, a3, 24
-; RV32I-NEXT:    or a3, a3, t3
-; RV32I-NEXT:    or a3, a3, t1
-; RV32I-NEXT:    srli t1, a3, 1
-; RV32I-NEXT:    srl a7, t1, a7
-; RV32I-NEXT:    or a7, t0, a7
-; RV32I-NEXT:    sll a3, a3, a4
-; RV32I-NEXT:    srli a5, a5, 1
-; RV32I-NEXT:    not t1, a4
-; RV32I-NEXT:    srl a5, a5, t1
-; RV32I-NEXT:    or a5, a3, a5
-; RV32I-NEXT:    sll a4, a6, a4
-; RV32I-NEXT:    sb a4, 0(a2)
-; RV32I-NEXT:    srli a6, a3, 16
-; RV32I-NEXT:    sb a6, 10(a2)
-; RV32I-NEXT:    srli a6, a3, 24
-; RV32I-NEXT:    sb a6, 11(a2)
-; RV32I-NEXT:    srli a3, a3, 8
-; RV32I-NEXT:    sb a3, 9(a2)
-; RV32I-NEXT:    srli a3, t0, 16
-; RV32I-NEXT:    sb a3, 14(a2)
-; RV32I-NEXT:    srli a3, t0, 24
-; RV32I-NEXT:    sb a3, 15(a2)
-; RV32I-NEXT:    srli a3, t0, 8
-; RV32I-NEXT:    sb a3, 13(a2)
-; RV32I-NEXT:    srli a3, a4, 16
+; RV32I-NEXT:    or a5, a6, a5
+; RV32I-NEXT:    lbu a6, 13(a0)
+; RV32I-NEXT:    lbu a7, 12(a0)
+; RV32I-NEXT:    lbu t0, 14(a0)
+; RV32I-NEXT:    lbu a0, 15(a0)
+; RV32I-NEXT:    slli a6, a6, 8
+; RV32I-NEXT:    or a6, a6, a7
+; RV32I-NEXT:    slli t0, t0, 16
+; RV32I-NEXT:    slli a0, a0, 24
+; RV32I-NEXT:    or a0, a0, t0
+; RV32I-NEXT:    or a0, a0, a6
+; RV32I-NEXT:    lbu a6, 1(a1)
+; RV32I-NEXT:    lbu a7, 0(a1)
+; RV32I-NEXT:    lbu t0, 2(a1)
+; RV32I-NEXT:    lbu a1, 3(a1)
+; RV32I-NEXT:    slli a6, a6, 8
+; RV32I-NEXT:    or a6, a6, a7
+; RV32I-NEXT:    slli t0, t0, 16
+; RV32I-NEXT:    slli a1, a1, 24
+; RV32I-NEXT:    or a1, a1, t0
+; RV32I-NEXT:    or a1, a1, a6
+; RV32I-NEXT:    sw zero, 12(sp)
+; RV32I-NEXT:    sw zero, 8(sp)
+; RV32I-NEXT:    sw zero, 4(sp)
+; RV32I-NEXT:    sw zero, 0(sp)
+; RV32I-NEXT:    sw a0, 28(sp)
+; RV32I-NEXT:    sw a5, 24(sp)
+; RV32I-NEXT:    sw a4, 20(sp)
+; RV32I-NEXT:    sw a3, 16(sp)
+; RV32I-NEXT:    srli a0, a1, 3
+; RV32I-NEXT:    andi a0, a0, 12
+; RV32I-NEXT:    addi a3, sp, 16
+; RV32I-NEXT:    sub a3, a3, a0
+; RV32I-NEXT:    lw a0, 4(a3)
+; RV32I-NEXT:    lw a4, 0(a3)
+; RV32I-NEXT:    sll a5, a0, a1
+; RV32I-NEXT:    andi a6, a1, 31
+; RV32I-NEXT:    xori a6, a6, 31
+; RV32I-NEXT:    srli a7, a4, 1
+; RV32I-NEXT:    lw t0, 12(a3)
+; RV32I-NEXT:    lw a3, 8(a3)
+; RV32I-NEXT:    srl a7, a7, a6
+; RV32I-NEXT:    or a5, a5, a7
+; RV32I-NEXT:    sll a7, t0, a1
+; RV32I-NEXT:    srli t0, a3, 1
+; RV32I-NEXT:    srl t0, t0, a6
+; RV32I-NEXT:    or a7, a7, t0
+; RV32I-NEXT:    sll a3, a3, a1
+; RV32I-NEXT:    srli a0, a0, 1
+; RV32I-NEXT:    srl a0, a0, a6
+; RV32I-NEXT:    or a0, a3, a0
+; RV32I-NEXT:    sll a1, a4, a1
+; RV32I-NEXT:    sb a1, 0(a2)
+; RV32I-NEXT:    srli a3, a1, 16
 ; RV32I-NEXT:    sb a3, 2(a2)
-; RV32I-NEXT:    srli a3, a4, 24
+; RV32I-NEXT:    srli a3, a1, 24
 ; RV32I-NEXT:    sb a3, 3(a2)
-; RV32I-NEXT:    srli a4, a4, 8
-; RV32I-NEXT:    sb a4, 1(a2)
-; RV32I-NEXT:    srli a3, a0, 16
-; RV32I-NEXT:    sb a3, 6(a2)
-; RV32I-NEXT:    srli a3, a0, 24
-; RV32I-NEXT:    sb a3, 7(a2)
-; RV32I-NEXT:    srli a0, a0, 8
-; RV32I-NEXT:    sb a0, 5(a2)
-; RV32I-NEXT:    sb a5, 8(a2)
+; RV32I-NEXT:    srli a1, a1, 8
+; RV32I-NEXT:    sb a1, 1(a2)
+; RV32I-NEXT:    sb a0, 8(a2)
 ; RV32I-NEXT:    sb a7, 12(a2)
-; RV32I-NEXT:    sb a1, 4(a2)
-; RV32I-NEXT:    lw s0, 60(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s1, 56(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s2, 52(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s3, 48(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s4, 44(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    addi sp, sp, 64
+; RV32I-NEXT:    sb a5, 4(a2)
+; RV32I-NEXT:    srli a1, a0, 16
+; RV32I-NEXT:    sb a1, 10(a2)
+; RV32I-NEXT:    srli a1, a0, 24
+; RV32I-NEXT:    sb a1, 11(a2)
+; RV32I-NEXT:    srli a0, a0, 8
+; RV32I-NEXT:    sb a0, 9(a2)
+; RV32I-NEXT:    srli a0, a7, 16
+; RV32I-NEXT:    sb a0, 14(a2)
+; RV32I-NEXT:    srli a0, a7, 24
+; RV32I-NEXT:    sb a0, 15(a2)
+; RV32I-NEXT:    srli a0, a7, 8
+; RV32I-NEXT:    sb a0, 13(a2)
+; RV32I-NEXT:    srli a0, a5, 16
+; RV32I-NEXT:    sb a0, 6(a2)
+; RV32I-NEXT:    srli a0, a5, 24
+; RV32I-NEXT:    sb a0, 7(a2)
+; RV32I-NEXT:    srli a5, a5, 8
+; RV32I-NEXT:    sb a5, 5(a2)
+; RV32I-NEXT:    addi sp, sp, 32
 ; RV32I-NEXT:    ret
   %src = load i128, ptr %src.ptr, align 1
   %bitOff = load i128, ptr %bitOff.ptr, align 1
@@ -1270,171 +1176,118 @@ define void @ashr_16bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ;
 ; RV32I-LABEL: ashr_16bytes:
 ; RV32I:       # %bb.0:
-; RV32I-NEXT:    addi sp, sp, -64
-; RV32I-NEXT:    sw s0, 60(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s1, 56(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s2, 52(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s3, 48(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s4, 44(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s5, 40(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    lbu a3, 15(a0)
-; RV32I-NEXT:    slli a4, a3, 24
-; RV32I-NEXT:    lbu a5, 0(a0)
-; RV32I-NEXT:    lbu a6, 1(a0)
-; RV32I-NEXT:    lbu a7, 2(a0)
-; RV32I-NEXT:    lbu t0, 3(a0)
-; RV32I-NEXT:    lbu t1, 4(a0)
-; RV32I-NEXT:    lbu t2, 5(a0)
-; RV32I-NEXT:    lbu t3, 6(a0)
-; RV32I-NEXT:    lbu t4, 7(a0)
-; RV32I-NEXT:    lbu t5, 8(a0)
-; RV32I-NEXT:    lbu t6, 9(a0)
-; RV32I-NEXT:    lbu s0, 10(a0)
-; RV32I-NEXT:    lbu s1, 1(a1)
-; RV32I-NEXT:    lbu s2, 0(a1)
-; RV32I-NEXT:    lbu s3, 11(a0)
-; RV32I-NEXT:    lbu s4, 12(a0)
-; RV32I-NEXT:    slli s1, s1, 8
-; RV32I-NEXT:    or s1, s1, s2
-; RV32I-NEXT:    lbu s2, 2(a1)
-; RV32I-NEXT:    lbu a1, 3(a1)
-; RV32I-NEXT:    lbu s5, 13(a0)
-; RV32I-NEXT:    lbu a0, 14(a0)
-; RV32I-NEXT:    slli s2, s2, 16
-; RV32I-NEXT:    slli a1, a1, 24
-; RV32I-NEXT:    or a1, a1, s2
-; RV32I-NEXT:    or a1, a1, s1
-; RV32I-NEXT:    sb a3, 23(sp)
-; RV32I-NEXT:    sb a0, 22(sp)
-; RV32I-NEXT:    sb s5, 21(sp)
-; RV32I-NEXT:    sb s4, 20(sp)
-; RV32I-NEXT:    sb s3, 19(sp)
-; RV32I-NEXT:    sb s0, 18(sp)
-; RV32I-NEXT:    sb t6, 17(sp)
-; RV32I-NEXT:    sb t5, 16(sp)
-; RV32I-NEXT:    sb t4, 15(sp)
-; RV32I-NEXT:    sb t3, 14(sp)
-; RV32I-NEXT:    sb t2, 13(sp)
-; RV32I-NEXT:    sb t1, 12(sp)
-; RV32I-NEXT:    sb t0, 11(sp)
-; RV32I-NEXT:    sb a7, 10(sp)
-; RV32I-NEXT:    sb a6, 9(sp)
-; RV32I-NEXT:    sb a5, 8(sp)
-; RV32I-NEXT:    srai a4, a4, 31
-; RV32I-NEXT:    sb a4, 36(sp)
-; RV32I-NEXT:    sb a4, 32(sp)
-; RV32I-NEXT:    sb a4, 28(sp)
-; RV32I-NEXT:    sb a4, 24(sp)
-; RV32I-NEXT:    srli a0, a4, 24
-; RV32I-NEXT:    sb a0, 39(sp)
-; RV32I-NEXT:    srli a3, a4, 16
-; RV32I-NEXT:    sb a3, 38(sp)
-; RV32I-NEXT:    srli a4, a4, 8
-; RV32I-NEXT:    sb a4, 37(sp)
-; RV32I-NEXT:    sb a0, 35(sp)
-; RV32I-NEXT:    sb a3, 34(sp)
-; RV32I-NEXT:    sb a4, 33(sp)
-; RV32I-NEXT:    sb a0, 31(sp)
-; RV32I-NEXT:    sb a3, 30(sp)
-; RV32I-NEXT:    sb a4, 29(sp)
-; RV32I-NEXT:    sb a0, 27(sp)
-; RV32I-NEXT:    sb a3, 26(sp)
-; RV32I-NEXT:    sb a4, 25(sp)
-; RV32I-NEXT:    slli a0, a1, 25
-; RV32I-NEXT:    srli a0, a0, 28
-; RV32I-NEXT:    addi a3, sp, 8
-; RV32I-NEXT:    add a3, a3, a0
-; RV32I-NEXT:    lbu a0, 5(a3)
-; RV32I-NEXT:    lbu a4, 4(a3)
-; RV32I-NEXT:    lbu a5, 6(a3)
-; RV32I-NEXT:    lbu a6, 7(a3)
-; RV32I-NEXT:    slli a0, a0, 8
-; RV32I-NEXT:    or a0, a0, a4
+; RV32I-NEXT:    addi sp, sp, -32
+; RV32I-NEXT:    lbu a3, 1(a0)
+; RV32I-NEXT:    lbu a4, 0(a0)
+; RV32I-NEXT:    lbu a5, 2(a0)
+; RV32I-NEXT:    lbu a6, 3(a0)
+; RV32I-NEXT:    slli a3, a3, 8
+; RV32I-NEXT:    or a3, a3, a4
 ; RV32I-NEXT:    slli a5, a5, 16
 ; RV32I-NEXT:    slli a6, a6, 24
 ; RV32I-NEXT:    or a4, a6, a5
-; RV32I-NEXT:    or a5, a4, a0
-; RV32I-NEXT:    andi a4, a1, 7
-; RV32I-NEXT:    srl a0, a5, a4
-; RV32I-NEXT:    lbu a1, 9(a3)
-; RV32I-NEXT:    lbu a6, 8(a3)
-; RV32I-NEXT:    lbu a7, 10(a3)
-; RV32I-NEXT:    lbu t0, 11(a3)
-; RV32I-NEXT:    slli a1, a1, 8
-; RV32I-NEXT:    or a1, a1, a6
+; RV32I-NEXT:    or a3, a4, a3
+; RV32I-NEXT:    lbu a4, 5(a0)
+; RV32I-NEXT:    lbu a5, 4(a0)
+; RV32I-NEXT:    lbu a6, 6(a0)
+; RV32I-NEXT:    lbu a7, 7(a0)
+; RV32I-NEXT:    slli a4, a4, 8
+; RV32I-NEXT:    or a4, a4, a5
+; RV32I-NEXT:    slli a6, a6, 16
+; RV32I-NEXT:    slli a7, a7, 24
+; RV32I-NEXT:    or a5, a7, a6
+; RV32I-NEXT:    or a4, a5, a4
+; RV32I-NEXT:    lbu a5, 9(a0)
+; RV32I-NEXT:    lbu a6, 8(a0)
+; RV32I-NEXT:    lbu a7, 10(a0)
+; RV32I-NEXT:    lbu t0, 11(a0)
+; RV32I-NEXT:    slli a5, a5, 8
+; RV32I-NEXT:    or a5, a5, a6
 ; RV32I-NEXT:    slli a7, a7, 16
 ; RV32I-NEXT:    slli t0, t0, 24
 ; RV32I-NEXT:    or a6, t0, a7
-; RV32I-NEXT:    or a6, a6, a1
-; RV32I-NEXT:    slli a1, a6, 1
-; RV32I-NEXT:    not a7, a4
-; RV32I-NEXT:    sll a1, a1, a7
-; RV32I-NEXT:    or a1, a0, a1
-; RV32I-NEXT:    lbu a7, 1(a3)
-; RV32I-NEXT:    lbu t0, 0(a3)
-; RV32I-NEXT:    lbu t1, 2(a3)
-; RV32I-NEXT:    lbu t2, 3(a3)
+; RV32I-NEXT:    or a5, a6, a5
+; RV32I-NEXT:    lbu a6, 13(a0)
+; RV32I-NEXT:    lbu a7, 12(a0)
+; RV32I-NEXT:    lbu t0, 14(a0)
+; RV32I-NEXT:    lbu a0, 15(a0)
+; RV32I-NEXT:    slli a6, a6, 8
+; RV32I-NEXT:    or a6, a6, a7
+; RV32I-NEXT:    slli t0, t0, 16
+; RV32I-NEXT:    slli a0, a0, 24
+; RV32I-NEXT:    or a7, a0, t0
+; RV32I-NEXT:    or a6, a7, a6
+; RV32I-NEXT:    lbu a7, 1(a1)
+; RV32I-NEXT:    lbu t0, 0(a1)
+; RV32I-NEXT:    lbu t1, 2(a1)
+; RV32I-NEXT:    lbu a1, 3(a1)
 ; RV32I-NEXT:    slli a7, a7, 8
 ; RV32I-NEXT:    or a7, a7, t0
 ; RV32I-NEXT:    slli t1, t1, 16
-; RV32I-NEXT:    slli t2, t2, 24
-; RV32I-NEXT:    or t0, t2, t1
-; RV32I-NEXT:    or a7, t0, a7
-; RV32I-NEXT:    srl a7, a7, a4
-; RV32I-NEXT:    slli a5, a5, 1
-; RV32I-NEXT:    xori t0, a4, 31
-; RV32I-NEXT:    sll a5, a5, t0
-; RV32I-NEXT:    or a5, a7, a5
-; RV32I-NEXT:    srl a6, a6, a4
-; RV32I-NEXT:    lbu t1, 13(a3)
-; RV32I-NEXT:    lbu t2, 12(a3)
-; RV32I-NEXT:    lbu t3, 14(a3)
-; RV32I-NEXT:    lbu a3, 15(a3)
-; RV32I-NEXT:    slli t1, t1, 8
-; RV32I-NEXT:    or t1, t1, t2
-; RV32I-NEXT:    slli t3, t3, 16
-; RV32I-NEXT:    slli a3, a3, 24
-; RV32I-NEXT:    or a3, a3, t3
-; RV32I-NEXT:    or a3, a3, t1
-; RV32I-NEXT:    slli t1, a3, 1
-; RV32I-NEXT:    sll t0, t1, t0
-; RV32I-NEXT:    or t0, a6, t0
-; RV32I-NEXT:    sra a3, a3, a4
-; RV32I-NEXT:    sb a6, 8(a2)
-; RV32I-NEXT:    sb a3, 12(a2)
-; RV32I-NEXT:    sb a7, 0(a2)
-; RV32I-NEXT:    sb a0, 4(a2)
-; RV32I-NEXT:    srli a4, a6, 16
-; RV32I-NEXT:    sb a4, 10(a2)
-; RV32I-NEXT:    srli a4, a6, 8
-; RV32I-NEXT:    sb a4, 9(a2)
-; RV32I-NEXT:    srli a4, a3, 16
-; RV32I-NEXT:    sb a4, 14(a2)
-; RV32I-NEXT:    srli a4, a3, 24
-; RV32I-NEXT:    sb a4, 15(a2)
-; RV32I-NEXT:    srli a3, a3, 8
-; RV32I-NEXT:    sb a3, 13(a2)
-; RV32I-NEXT:    srli a3, a7, 16
-; RV32I-NEXT:    sb a3, 2(a2)
-; RV32I-NEXT:    srli a3, a7, 8
-; RV32I-NEXT:    sb a3, 1(a2)
-; RV32I-NEXT:    srli a3, a0, 16
-; RV32I-NEXT:    sb a3, 6(a2)
+; RV32I-NEXT:    slli a1, a1, 24
+; RV32I-NEXT:    or a1, a1, t1
+; RV32I-NEXT:    or a1, a1, a7
+; RV32I-NEXT:    srai a0, a0, 31
+; RV32I-NEXT:    sw a0, 28(sp)
+; RV32I-NEXT:    sw a0, 24(sp)
+; RV32I-NEXT:    sw a0, 20(sp)
+; RV32I-NEXT:    sw a0, 16(sp)
+; RV32I-NEXT:    sw a6, 12(sp)
+; RV32I-NEXT:    sw a5, 8(sp)
+; RV32I-NEXT:    sw a4, 4(sp)
+; RV32I-NEXT:    sw a3, 0(sp)
+; RV32I-NEXT:    srli a0, a1, 3
+; RV32I-NEXT:    andi a0, a0, 12
+; RV32I-NEXT:    mv a3, sp
+; RV32I-NEXT:    add a0, a3, a0
+; RV32I-NEXT:    lw a3, 4(a0)
+; RV32I-NEXT:    srl a4, a3, a1
+; RV32I-NEXT:    lw a5, 8(a0)
+; RV32I-NEXT:    andi a6, a1, 31
+; RV32I-NEXT:    xori a6, a6, 31
+; RV32I-NEXT:    lw a7, 0(a0)
+; RV32I-NEXT:    slli t0, a5, 1
+; RV32I-NEXT:    sll t0, t0, a6
+; RV32I-NEXT:    or a4, a4, t0
+; RV32I-NEXT:    srl a7, a7, a1
+; RV32I-NEXT:    slli a3, a3, 1
+; RV32I-NEXT:    lw a0, 12(a0)
+; RV32I-NEXT:    sll a3, a3, a6
+; RV32I-NEXT:    or a3, a7, a3
+; RV32I-NEXT:    srl a5, a5, a1
+; RV32I-NEXT:    slli a7, a0, 1
+; RV32I-NEXT:    sll a6, a7, a6
+; RV32I-NEXT:    or a5, a5, a6
+; RV32I-NEXT:    sra a0, a0, a1
+; RV32I-NEXT:    sb a0, 12(a2)
+; RV32I-NEXT:    srli a1, a0, 16
+; RV32I-NEXT:    sb a1, 14(a2)
+; RV32I-NEXT:    srli a1, a0, 24
+; RV32I-NEXT:    sb a1, 15(a2)
 ; RV32I-NEXT:    srli a0, a0, 8
-; RV32I-NEXT:    sb a0, 5(a2)
-; RV32I-NEXT:    srli a0, t0, 24
+; RV32I-NEXT:    sb a0, 13(a2)
+; RV32I-NEXT:    sb a5, 8(a2)
+; RV32I-NEXT:    sb a3, 0(a2)
+; RV32I-NEXT:    sb a4, 4(a2)
+; RV32I-NEXT:    srli a0, a5, 16
+; RV32I-NEXT:    sb a0, 10(a2)
+; RV32I-NEXT:    srli a0, a5, 24
 ; RV32I-NEXT:    sb a0, 11(a2)
-; RV32I-NEXT:    srli a5, a5, 24
-; RV32I-NEXT:    sb a5, 3(a2)
-; RV32I-NEXT:    srli a1, a1, 24
-; RV32I-NEXT:    sb a1, 7(a2)
-; RV32I-NEXT:    lw s0, 60(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s1, 56(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s2, 52(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s3, 48(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s4, 44(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s5, 40(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    addi sp, sp, 64
+; RV32I-NEXT:    srli a5, a5, 8
+; RV32I-NEXT:    sb a5, 9(a2)
+; RV32I-NEXT:    srli a0, a3, 16
+; RV32I-NEXT:    sb a0, 2(a2)
+; RV32I-NEXT:    srli a0, a3, 24
+; RV32I-NEXT:    sb a0, 3(a2)
+; RV32I-NEXT:    srli a3, a3, 8
+; RV32I-NEXT:    sb a3, 1(a2)
+; RV32I-NEXT:    srli a0, a4, 16
+; RV32I-NEXT:    sb a0, 6(a2)
+; RV32I-NEXT:    srli a0, a4, 24
+; RV32I-NEXT:    sb a0, 7(a2)
+; RV32I-NEXT:    srli a4, a4, 8
+; RV32I-NEXT:    sb a4, 5(a2)
+; RV32I-NEXT:    addi sp, sp, 32
 ; RV32I-NEXT:    ret
   %src = load i128, ptr %src.ptr, align 1
   %bitOff = load i128, ptr %bitOff.ptr, align 1
@@ -1446,191 +1299,43 @@ define void @ashr_16bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 define void @lshr_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; RV64I-LABEL: lshr_32bytes:
 ; RV64I:       # %bb.0:
-; RV64I-NEXT:    addi sp, sp, -224
-; RV64I-NEXT:    sd ra, 216(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    sd s0, 208(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    sd s1, 200(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    sd s2, 192(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    sd s3, 184(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    sd s4, 176(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    sd s5, 168(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    sd s6, 160(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    sd s7, 152(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    sd s8, 144(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    sd s9, 136(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    sd s10, 128(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    sd s11, 120(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    lbu a3, 0(a0)
-; RV64I-NEXT:    sd a3, 48(sp) # 8-byte Folded Spill
+; RV64I-NEXT:    addi sp, sp, -64
 ; RV64I-NEXT:    lbu a3, 1(a0)
-; RV64I-NEXT:    sd a3, 40(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    lbu a3, 2(a0)
-; RV64I-NEXT:    sd a3, 32(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    lbu a3, 3(a0)
-; RV64I-NEXT:    sd a3, 24(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    lbu a3, 4(a0)
-; RV64I-NEXT:    sd a3, 16(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    lbu a3, 5(a0)
-; RV64I-NEXT:    sd a3, 8(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    lbu t1, 6(a0)
-; RV64I-NEXT:    lbu t2, 7(a0)
-; RV64I-NEXT:    lbu t3, 8(a0)
-; RV64I-NEXT:    lbu t4, 9(a0)
-; RV64I-NEXT:    lbu t5, 10(a0)
-; RV64I-NEXT:    lbu t6, 11(a0)
-; RV64I-NEXT:    lbu s0, 12(a0)
-; RV64I-NEXT:    lbu s1, 13(a0)
-; RV64I-NEXT:    lbu s2, 14(a0)
-; RV64I-NEXT:    lbu s3, 15(a0)
-; RV64I-NEXT:    lbu s4, 16(a0)
-; RV64I-NEXT:    lbu s5, 17(a0)
-; RV64I-NEXT:    lbu s6, 18(a0)
-; RV64I-NEXT:    lbu s7, 19(a0)
-; RV64I-NEXT:    lbu s8, 20(a0)
-; RV64I-NEXT:    lbu s9, 1(a1)
-; RV64I-NEXT:    lbu s10, 0(a1)
-; RV64I-NEXT:    lbu s11, 2(a1)
-; RV64I-NEXT:    lbu ra, 3(a1)
-; RV64I-NEXT:    slli s9, s9, 8
-; RV64I-NEXT:    or s9, s9, s10
-; RV64I-NEXT:    slli s11, s11, 16
-; RV64I-NEXT:    slli ra, ra, 24
-; RV64I-NEXT:    lbu s10, 5(a1)
-; RV64I-NEXT:    or s11, ra, s11
-; RV64I-NEXT:    or s11, s11, s9
-; RV64I-NEXT:    lbu s9, 4(a1)
-; RV64I-NEXT:    slli s10, s10, 8
-; RV64I-NEXT:    lbu ra, 6(a1)
-; RV64I-NEXT:    lbu a1, 7(a1)
-; RV64I-NEXT:    or s10, s10, s9
-; RV64I-NEXT:    lbu s9, 21(a0)
-; RV64I-NEXT:    slli ra, ra, 16
-; RV64I-NEXT:    slli a1, a1, 24
-; RV64I-NEXT:    or a1, a1, ra
-; RV64I-NEXT:    lbu ra, 22(a0)
-; RV64I-NEXT:    or a1, a1, s10
-; RV64I-NEXT:    lbu s10, 23(a0)
-; RV64I-NEXT:    slli a1, a1, 32
-; RV64I-NEXT:    or t0, a1, s11
-; RV64I-NEXT:    lbu s11, 24(a0)
-; RV64I-NEXT:    lbu a7, 25(a0)
-; RV64I-NEXT:    lbu a6, 26(a0)
-; RV64I-NEXT:    lbu a5, 27(a0)
-; RV64I-NEXT:    lbu a1, 31(a0)
-; RV64I-NEXT:    lbu a3, 30(a0)
-; RV64I-NEXT:    lbu a4, 29(a0)
-; RV64I-NEXT:    lbu a0, 28(a0)
-; RV64I-NEXT:    sb a1, 87(sp)
-; RV64I-NEXT:    sb a3, 86(sp)
-; RV64I-NEXT:    sb a4, 85(sp)
-; RV64I-NEXT:    sb a0, 84(sp)
-; RV64I-NEXT:    sb a5, 83(sp)
-; RV64I-NEXT:    sb a6, 82(sp)
-; RV64I-NEXT:    sb a7, 81(sp)
-; RV64I-NEXT:    sb s11, 80(sp)
-; RV64I-NEXT:    sb s10, 79(sp)
-; RV64I-NEXT:    sb ra, 78(sp)
-; RV64I-NEXT:    sb s9, 77(sp)
-; RV64I-NEXT:    sb s8, 76(sp)
-; RV64I-NEXT:    sb s7, 75(sp)
-; RV64I-NEXT:    sb s6, 74(sp)
-; RV64I-NEXT:    sb s5, 73(sp)
-; RV64I-NEXT:    sb s4, 72(sp)
-; RV64I-NEXT:    sb s3, 71(sp)
-; RV64I-NEXT:    sb s2, 70(sp)
-; RV64I-NEXT:    sb s1, 69(sp)
-; RV64I-NEXT:    sb s0, 68(sp)
-; RV64I-NEXT:    sb t6, 67(sp)
-; RV64I-NEXT:    sb t5, 66(sp)
-; RV64I-NEXT:    sb t4, 65(sp)
-; RV64I-NEXT:    sb zero, 119(sp)
-; RV64I-NEXT:    sb zero, 118(sp)
-; RV64I-NEXT:    sb zero, 117(sp)
-; RV64I-NEXT:    sb zero, 116(sp)
-; RV64I-NEXT:    sb zero, 115(sp)
-; RV64I-NEXT:    sb zero, 114(sp)
-; RV64I-NEXT:    sb zero, 113(sp)
-; RV64I-NEXT:    sb zero, 112(sp)
-; RV64I-NEXT:    sb zero, 111(sp)
-; RV64I-NEXT:    sb zero, 110(sp)
-; RV64I-NEXT:    sb zero, 109(sp)
-; RV64I-NEXT:    sb zero, 108(sp)
-; RV64I-NEXT:    sb zero, 107(sp)
-; RV64I-NEXT:    sb zero, 106(sp)
-; RV64I-NEXT:    sb zero, 105(sp)
-; RV64I-NEXT:    sb zero, 104(sp)
-; RV64I-NEXT:    sb zero, 103(sp)
-; RV64I-NEXT:    sb zero, 102(sp)
-; RV64I-NEXT:    sb zero, 101(sp)
-; RV64I-NEXT:    sb zero, 100(sp)
-; RV64I-NEXT:    sb zero, 99(sp)
-; RV64I-NEXT:    sb zero, 98(sp)
-; RV64I-NEXT:    sb zero, 97(sp)
-; RV64I-NEXT:    sb zero, 96(sp)
-; RV64I-NEXT:    sb zero, 95(sp)
-; RV64I-NEXT:    sb zero, 94(sp)
-; RV64I-NEXT:    sb zero, 93(sp)
-; RV64I-NEXT:    sb zero, 92(sp)
-; RV64I-NEXT:    sb zero, 91(sp)
-; RV64I-NEXT:    sb zero, 90(sp)
-; RV64I-NEXT:    sb zero, 89(sp)
-; RV64I-NEXT:    sb zero, 88(sp)
-; RV64I-NEXT:    sb t3, 64(sp)
-; RV64I-NEXT:    sb t2, 63(sp)
-; RV64I-NEXT:    sb t1, 62(sp)
-; RV64I-NEXT:    ld a0, 8(sp) # 8-byte Folded Reload
-; RV64I-NEXT:    sb a0, 61(sp)
-; RV64I-NEXT:    ld a0, 16(sp) # 8-byte Folded Reload
-; RV64I-NEXT:    sb a0, 60(sp)
-; RV64I-NEXT:    ld a0, 24(sp) # 8-byte Folded Reload
-; RV64I-NEXT:    sb a0, 59(sp)
-; RV64I-NEXT:    ld a0, 32(sp) # 8-byte Folded Reload
-; RV64I-NEXT:    sb a0, 58(sp)
-; RV64I-NEXT:    ld a0, 40(sp) # 8-byte Folded Reload
-; RV64I-NEXT:    sb a0, 57(sp)
-; RV64I-NEXT:    ld a0, 48(sp) # 8-byte Folded Reload
-; RV64I-NEXT:    sb a0, 56(sp)
-; RV64I-NEXT:    slli a0, t0, 56
-; RV64I-NEXT:    srli a0, a0, 59
-; RV64I-NEXT:    addi a3, sp, 56
-; RV64I-NEXT:    add a3, a3, a0
-; RV64I-NEXT:    lbu a0, 9(a3)
-; RV64I-NEXT:    lbu a1, 8(a3)
-; RV64I-NEXT:    lbu a4, 10(a3)
-; RV64I-NEXT:    lbu a5, 11(a3)
-; RV64I-NEXT:    slli a0, a0, 8
-; RV64I-NEXT:    or a0, a0, a1
-; RV64I-NEXT:    slli a4, a4, 16
-; RV64I-NEXT:    slli a5, a5, 24
-; RV64I-NEXT:    or a4, a5, a4
-; RV64I-NEXT:    or a0, a4, a0
-; RV64I-NEXT:    lbu a1, 13(a3)
-; RV64I-NEXT:    lbu a4, 12(a3)
-; RV64I-NEXT:    lbu a5, 14(a3)
-; RV64I-NEXT:    lbu a6, 15(a3)
-; RV64I-NEXT:    slli a1, a1, 8
-; RV64I-NEXT:    or a1, a1, a4
+; RV64I-NEXT:    lbu a4, 0(a0)
+; RV64I-NEXT:    lbu a5, 2(a0)
+; RV64I-NEXT:    lbu a6, 3(a0)
+; RV64I-NEXT:    slli a3, a3, 8
+; RV64I-NEXT:    or a3, a3, a4
 ; RV64I-NEXT:    slli a5, a5, 16
 ; RV64I-NEXT:    slli a6, a6, 24
 ; RV64I-NEXT:    or a4, a6, a5
-; RV64I-NEXT:    or a1, a4, a1
-; RV64I-NEXT:    slli a1, a1, 32
-; RV64I-NEXT:    or a4, a1, a0
-; RV64I-NEXT:    andi a1, t0, 7
-; RV64I-NEXT:    lbu a0, 17(a3)
-; RV64I-NEXT:    lbu a5, 16(a3)
-; RV64I-NEXT:    lbu a6, 18(a3)
-; RV64I-NEXT:    lbu a7, 19(a3)
-; RV64I-NEXT:    slli a0, a0, 8
-; RV64I-NEXT:    or a0, a0, a5
+; RV64I-NEXT:    or a3, a4, a3
+; RV64I-NEXT:    lbu a4, 5(a0)
+; RV64I-NEXT:    lbu a5, 4(a0)
+; RV64I-NEXT:    lbu a6, 6(a0)
+; RV64I-NEXT:    lbu a7, 7(a0)
+; RV64I-NEXT:    slli a4, a4, 8
+; RV64I-NEXT:    or a4, a4, a5
+; RV64I-NEXT:    slli a6, a6, 16
+; RV64I-NEXT:    slli a7, a7, 24
+; RV64I-NEXT:    or a5, a7, a6
+; RV64I-NEXT:    or a4, a5, a4
+; RV64I-NEXT:    slli a4, a4, 32
+; RV64I-NEXT:    or a3, a4, a3
+; RV64I-NEXT:    lbu a4, 9(a0)
+; RV64I-NEXT:    lbu a5, 8(a0)
+; RV64I-NEXT:    lbu a6, 10(a0)
+; RV64I-NEXT:    lbu a7, 11(a0)
+; RV64I-NEXT:    slli a4, a4, 8
+; RV64I-NEXT:    or a4, a4, a5
 ; RV64I-NEXT:    slli a6, a6, 16
 ; RV64I-NEXT:    slli a7, a7, 24
 ; RV64I-NEXT:    or a5, a7, a6
-; RV64I-NEXT:    or a0, a5, a0
-; RV64I-NEXT:    lbu a5, 21(a3)
-; RV64I-NEXT:    lbu a6, 20(a3)
-; RV64I-NEXT:    lbu a7, 22(a3)
-; RV64I-NEXT:    lbu t0, 23(a3)
+; RV64I-NEXT:    or a4, a5, a4
+; RV64I-NEXT:    lbu a5, 13(a0)
+; RV64I-NEXT:    lbu a6, 12(a0)
+; RV64I-NEXT:    lbu a7, 14(a0)
+; RV64I-NEXT:    lbu t0, 15(a0)
 ; RV64I-NEXT:    slli a5, a5, 8
 ; RV64I-NEXT:    or a5, a5, a6
 ; RV64I-NEXT:    slli a7, a7, 16
@@ -1638,92 +1343,138 @@ define void @lshr_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; RV64I-NEXT:    or a6, t0, a7
 ; RV64I-NEXT:    or a5, a6, a5
 ; RV64I-NEXT:    slli a5, a5, 32
-; RV64I-NEXT:    or a5, a5, a0
-; RV64I-NEXT:    slli a0, a5, 1
-; RV64I-NEXT:    not a6, a1
-; RV64I-NEXT:    sll a0, a0, a6
-; RV64I-NEXT:    lbu a6, 1(a3)
-; RV64I-NEXT:    lbu a7, 0(a3)
-; RV64I-NEXT:    lbu t0, 2(a3)
-; RV64I-NEXT:    lbu t1, 3(a3)
+; RV64I-NEXT:    or a4, a5, a4
+; RV64I-NEXT:    lbu a5, 17(a0)
+; RV64I-NEXT:    lbu a6, 16(a0)
+; RV64I-NEXT:    lbu a7, 18(a0)
+; RV64I-NEXT:    lbu t0, 19(a0)
+; RV64I-NEXT:    slli a5, a5, 8
+; RV64I-NEXT:    or a5, a5, a6
+; RV64I-NEXT:    slli a7, a7, 16
+; RV64I-NEXT:    slli t0, t0, 24
+; RV64I-NEXT:    or a6, t0, a7
+; RV64I-NEXT:    or a5, a6, a5
+; RV64I-NEXT:    lbu a6, 21(a0)
+; RV64I-NEXT:    lbu a7, 20(a0)
+; RV64I-NEXT:    lbu t0, 22(a0)
+; RV64I-NEXT:    lbu t1, 23(a0)
 ; RV64I-NEXT:    slli a6, a6, 8
 ; RV64I-NEXT:    or a6, a6, a7
 ; RV64I-NEXT:    slli t0, t0, 16
 ; RV64I-NEXT:    slli t1, t1, 24
 ; RV64I-NEXT:    or a7, t1, t0
 ; RV64I-NEXT:    or a6, a7, a6
-; RV64I-NEXT:    lbu a7, 5(a3)
-; RV64I-NEXT:    lbu t0, 4(a3)
-; RV64I-NEXT:    lbu t1, 6(a3)
-; RV64I-NEXT:    lbu t2, 7(a3)
-; RV64I-NEXT:    slli a7, a7, 8
-; RV64I-NEXT:    or a7, a7, t0
-; RV64I-NEXT:    slli t1, t1, 16
-; RV64I-NEXT:    slli t2, t2, 24
-; RV64I-NEXT:    or t0, t2, t1
-; RV64I-NEXT:    or a7, t0, a7
-; RV64I-NEXT:    slli a7, a7, 32
+; RV64I-NEXT:    slli a6, a6, 32
+; RV64I-NEXT:    or a5, a6, a5
+; RV64I-NEXT:    lbu a6, 25(a0)
+; RV64I-NEXT:    lbu a7, 24(a0)
+; RV64I-NEXT:    lbu t0, 26(a0)
+; RV64I-NEXT:    lbu t1, 27(a0)
+; RV64I-NEXT:    slli a6, a6, 8
+; RV64I-NEXT:    or a6, a6, a7
+; RV64I-NEXT:    slli t0, t0, 16
+; RV64I-NEXT:    slli t1, t1, 24
+; RV64I-NEXT:    or a7, t1, t0
 ; RV64I-NEXT:    or a6, a7, a6
-; RV64I-NEXT:    lbu a7, 25(a3)
-; RV64I-NEXT:    lbu t0, 24(a3)
-; RV64I-NEXT:    lbu t1, 26(a3)
-; RV64I-NEXT:    lbu t2, 27(a3)
+; RV64I-NEXT:    lbu a7, 29(a0)
+; RV64I-NEXT:    lbu t0, 28(a0)
+; RV64I-NEXT:    lbu t1, 30(a0)
+; RV64I-NEXT:    lbu a0, 31(a0)
 ; RV64I-NEXT:    slli a7, a7, 8
 ; RV64I-NEXT:    or a7, a7, t0
 ; RV64I-NEXT:    slli t1, t1, 16
-; RV64I-NEXT:    slli t2, t2, 24
-; RV64I-NEXT:    or t0, t2, t1
-; RV64I-NEXT:    or a7, t0, a7
-; RV64I-NEXT:    lbu t0, 29(a3)
-; RV64I-NEXT:    lbu t1, 28(a3)
-; RV64I-NEXT:    lbu t2, 30(a3)
-; RV64I-NEXT:    lbu a3, 31(a3)
-; RV64I-NEXT:    slli t0, t0, 8
-; RV64I-NEXT:    or t0, t0, t1
-; RV64I-NEXT:    slli t2, t2, 16
-; RV64I-NEXT:    slli a3, a3, 24
-; RV64I-NEXT:    or a3, a3, t2
-; RV64I-NEXT:    slli t1, a4, 1
-; RV64I-NEXT:    or a3, a3, t0
-; RV64I-NEXT:    xori t0, a1, 63
-; RV64I-NEXT:    sll t1, t1, t0
-; RV64I-NEXT:    slli a3, a3, 32
-; RV64I-NEXT:    or a7, a3, a7
-; RV64I-NEXT:    slli a3, a7, 1
-; RV64I-NEXT:    sll t0, a3, t0
-; RV64I-NEXT:    srl a3, a4, a1
-; RV64I-NEXT:    srl a4, a6, a1
+; RV64I-NEXT:    slli a0, a0, 24
+; RV64I-NEXT:    or a0, a0, t1
+; RV64I-NEXT:    or a0, a0, a7
+; RV64I-NEXT:    slli a0, a0, 32
+; RV64I-NEXT:    or a0, a0, a6
+; RV64I-NEXT:    lbu a6, 1(a1)
+; RV64I-NEXT:    lbu a7, 0(a1)
+; RV64I-NEXT:    lbu t0, 2(a1)
+; RV64I-NEXT:    lbu t1, 3(a1)
+; RV64I-NEXT:    slli a6, a6, 8
+; RV64I-NEXT:    or a6, a6, a7
+; RV64I-NEXT:    slli t0, t0, 16
+; RV64I-NEXT:    slli t1, t1, 24
+; RV64I-NEXT:    or a7, t1, t0
+; RV64I-NEXT:    or a6, a7, a6
+; RV64I-NEXT:    lbu a7, 5(a1)
+; RV64I-NEXT:    lbu t0, 4(a1)
+; RV64I-NEXT:    lbu t1, 6(a1)
+; RV64I-NEXT:    lbu a1, 7(a1)
+; RV64I-NEXT:    slli a7, a7, 8
+; RV64I-NEXT:    or a7, a7, t0
+; RV64I-NEXT:    slli t1, t1, 16
+; RV64I-NEXT:    slli a1, a1, 24
+; RV64I-NEXT:    or a1, a1, t1
+; RV64I-NEXT:    or a1, a1, a7
+; RV64I-NEXT:    slli a1, a1, 32
+; RV64I-NEXT:    or a1, a1, a6
+; RV64I-NEXT:    sd zero, 56(sp)
+; RV64I-NEXT:    sd zero, 48(sp)
+; RV64I-NEXT:    sd zero, 40(sp)
+; RV64I-NEXT:    sd zero, 32(sp)
+; RV64I-NEXT:    sd a0, 24(sp)
+; RV64I-NEXT:    sd a5, 16(sp)
+; RV64I-NEXT:    sd a4, 8(sp)
+; RV64I-NEXT:    sd a3, 0(sp)
+; RV64I-NEXT:    srli a0, a1, 3
+; RV64I-NEXT:    andi a0, a0, 24
+; RV64I-NEXT:    mv a3, sp
+; RV64I-NEXT:    add a3, a3, a0
+; RV64I-NEXT:    ld a4, 8(a3)
+; RV64I-NEXT:    srl a0, a4, a1
+; RV64I-NEXT:    ld a5, 16(a3)
+; RV64I-NEXT:    andi a6, a1, 63
+; RV64I-NEXT:    xori a6, a6, 63
+; RV64I-NEXT:    ld a7, 0(a3)
+; RV64I-NEXT:    slli t0, a5, 1
+; RV64I-NEXT:    sll t0, t0, a6
+; RV64I-NEXT:    or a0, a0, t0
+; RV64I-NEXT:    srl a7, a7, a1
+; RV64I-NEXT:    slli a4, a4, 1
+; RV64I-NEXT:    ld a3, 24(a3)
+; RV64I-NEXT:    sll a4, a4, a6
+; RV64I-NEXT:    or a4, a7, a4
 ; RV64I-NEXT:    srl a5, a5, a1
-; RV64I-NEXT:    srl a1, a7, a1
-; RV64I-NEXT:    srli a6, a5, 48
-; RV64I-NEXT:    sb a6, 22(a2)
-; RV64I-NEXT:    srli a6, a5, 40
-; RV64I-NEXT:    sb a6, 21(a2)
-; RV64I-NEXT:    srli a6, a5, 32
-; RV64I-NEXT:    sb a6, 20(a2)
-; RV64I-NEXT:    srli a6, a5, 24
-; RV64I-NEXT:    sb a6, 19(a2)
-; RV64I-NEXT:    srli a6, a5, 16
-; RV64I-NEXT:    sb a6, 18(a2)
-; RV64I-NEXT:    or a6, a5, t0
-; RV64I-NEXT:    sb a5, 16(a2)
-; RV64I-NEXT:    srli a5, a5, 8
-; RV64I-NEXT:    sb a5, 17(a2)
-; RV64I-NEXT:    srli a5, a1, 56
-; RV64I-NEXT:    sb a5, 31(a2)
-; RV64I-NEXT:    srli a5, a1, 48
-; RV64I-NEXT:    sb a5, 30(a2)
-; RV64I-NEXT:    srli a5, a1, 40
-; RV64I-NEXT:    sb a5, 29(a2)
-; RV64I-NEXT:    srli a5, a1, 32
-; RV64I-NEXT:    sb a5, 28(a2)
-; RV64I-NEXT:    srli a5, a1, 24
-; RV64I-NEXT:    sb a5, 27(a2)
-; RV64I-NEXT:    srli a5, a1, 16
-; RV64I-NEXT:    sb a5, 26(a2)
+; RV64I-NEXT:    slli a7, a3, 1
+; RV64I-NEXT:    sll a6, a7, a6
+; RV64I-NEXT:    or a5, a5, a6
+; RV64I-NEXT:    srl a1, a3, a1
 ; RV64I-NEXT:    sb a1, 24(a2)
+; RV64I-NEXT:    srli a3, a1, 56
+; RV64I-NEXT:    sb a3, 31(a2)
+; RV64I-NEXT:    srli a3, a1, 48
+; RV64I-NEXT:    sb a3, 30(a2)
+; RV64I-NEXT:    srli a3, a1, 40
+; RV64I-NEXT:    sb a3, 29(a2)
+; RV64I-NEXT:    srli a3, a1, 32
+; RV64I-NEXT:    sb a3, 28(a2)
+; RV64I-NEXT:    srli a3, a1, 24
+; RV64I-NEXT:    sb a3, 27(a2)
+; RV64I-NEXT:    srli a3, a1, 16
+; RV64I-NEXT:    sb a3, 26(a2)
 ; RV64I-NEXT:    srli a1, a1, 8
 ; RV64I-NEXT:    sb a1, 25(a2)
+; RV64I-NEXT:    sb a5, 16(a2)
+; RV64I-NEXT:    sb a4, 0(a2)
+; RV64I-NEXT:    sb a0, 8(a2)
+; RV64I-NEXT:    srli a1, a5, 56
+; RV64I-NEXT:    sb a1, 23(a2)
+; RV64I-NEXT:    srli a1, a5, 48
+; RV64I-NEXT:    sb a1, 22(a2)
+; RV64I-NEXT:    srli a1, a5, 40
+; RV64I-NEXT:    sb a1, 21(a2)
+; RV64I-NEXT:    srli a1, a5, 32
+; RV64I-NEXT:    sb a1, 20(a2)
+; RV64I-NEXT:    srli a1, a5, 24
+; RV64I-NEXT:    sb a1, 19(a2)
+; RV64I-NEXT:    srli a1, a5, 16
+; RV64I-NEXT:    sb a1, 18(a2)
+; RV64I-NEXT:    srli a5, a5, 8
+; RV64I-NEXT:    sb a5, 17(a2)
+; RV64I-NEXT:    srli a1, a4, 56
+; RV64I-NEXT:    sb a1, 7(a2)
 ; RV64I-NEXT:    srli a1, a4, 48
 ; RV64I-NEXT:    sb a1, 6(a2)
 ; RV64I-NEXT:    srli a1, a4, 40
@@ -1734,366 +1485,234 @@ define void @lshr_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; RV64I-NEXT:    sb a1, 3(a2)
 ; RV64I-NEXT:    srli a1, a4, 16
 ; RV64I-NEXT:    sb a1, 2(a2)
-; RV64I-NEXT:    or a1, a4, t1
-; RV64I-NEXT:    sb a4, 0(a2)
 ; RV64I-NEXT:    srli a4, a4, 8
 ; RV64I-NEXT:    sb a4, 1(a2)
-; RV64I-NEXT:    srli a4, a3, 48
-; RV64I-NEXT:    sb a4, 14(a2)
-; RV64I-NEXT:    srli a4, a3, 40
-; RV64I-NEXT:    sb a4, 13(a2)
-; RV64I-NEXT:    srli a4, a3, 32
-; RV64I-NEXT:    sb a4, 12(a2)
-; RV64I-NEXT:    srli a4, a3, 24
-; RV64I-NEXT:    sb a4, 11(a2)
-; RV64I-NEXT:    srli a4, a3, 16
-; RV64I-NEXT:    sb a4, 10(a2)
-; RV64I-NEXT:    or a0, a3, a0
-; RV64I-NEXT:    sb a3, 8(a2)
-; RV64I-NEXT:    srli a3, a3, 8
-; RV64I-NEXT:    sb a3, 9(a2)
-; RV64I-NEXT:    srli a3, a6, 56
-; RV64I-NEXT:    sb a3, 23(a2)
-; RV64I-NEXT:    srli a1, a1, 56
-; RV64I-NEXT:    sb a1, 7(a2)
-; RV64I-NEXT:    srli a0, a0, 56
-; RV64I-NEXT:    sb a0, 15(a2)
-; RV64I-NEXT:    ld ra, 216(sp) # 8-byte Folded Reload
-; RV64I-NEXT:    ld s0, 208(sp) # 8-byte Folded Reload
-; RV64I-NEXT:    ld s1, 200(sp) # 8-byte Folded Reload
-; RV64I-NEXT:    ld s2, 192(sp) # 8-byte Folded Reload
-; RV64I-NEXT:    ld s3, 184(sp) # 8-byte Folded Reload
-; RV64I-NEXT:    ld s4, 176(sp) # 8-byte Folded Reload
-; RV64I-NEXT:    ld s5, 168(sp) # 8-byte Folded Reload
-; RV64I-NEXT:    ld s6, 160(sp) # 8-byte Folded Reload
-; RV64I-NEXT:    ld s7, 152(sp) # 8-byte Folded Reload
-; RV64I-NEXT:    ld s8, 144(sp) # 8-byte Folded Reload
-; RV64I-NEXT:    ld s9, 136(sp) # 8-byte Folded Reload
-; RV64I-NEXT:    ld s10, 128(sp) # 8-byte Folded Reload
-; RV64I-NEXT:    ld s11, 120(sp) # 8-byte Folded Reload
-; RV64I-NEXT:    addi sp, sp, 224
+; RV64I-NEXT:    srli a1, a0, 56
+; RV64I-NEXT:    sb a1, 15(a2)
+; RV64I-NEXT:    srli a1, a0, 48
+; RV64I-NEXT:    sb a1, 14(a2)
+; RV64I-NEXT:    srli a1, a0, 40
+; RV64I-NEXT:    sb a1, 13(a2)
+; RV64I-NEXT:    srli a1, a0, 32
+; RV64I-NEXT:    sb a1, 12(a2)
+; RV64I-NEXT:    srli a1, a0, 24
+; RV64I-NEXT:    sb a1, 11(a2)
+; RV64I-NEXT:    srli a1, a0, 16
+; RV64I-NEXT:    sb a1, 10(a2)
+; RV64I-NEXT:    srli a0, a0, 8
+; RV64I-NEXT:    sb a0, 9(a2)
+; RV64I-NEXT:    addi sp, sp, 64
 ; RV64I-NEXT:    ret
 ;
 ; RV32I-LABEL: lshr_32bytes:
 ; RV32I:       # %bb.0:
-; RV32I-NEXT:    addi sp, sp, -144
-; RV32I-NEXT:    sw ra, 140(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s0, 136(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s1, 132(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s2, 128(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s3, 124(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s4, 120(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s5, 116(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s6, 112(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s7, 108(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s8, 104(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s9, 100(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s10, 96(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s11, 92(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    lbu a3, 0(a0)
-; RV32I-NEXT:    sw a3, 24(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    addi sp, sp, -64
 ; RV32I-NEXT:    lbu a3, 1(a0)
-; RV32I-NEXT:    sw a3, 20(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    lbu a3, 2(a0)
-; RV32I-NEXT:    sw a3, 16(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    lbu a3, 3(a0)
-; RV32I-NEXT:    sw a3, 12(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    lbu a3, 4(a0)
-; RV32I-NEXT:    sw a3, 8(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    lbu a3, 5(a0)
-; RV32I-NEXT:    sw a3, 4(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    lbu t1, 6(a0)
-; RV32I-NEXT:    lbu t2, 7(a0)
-; RV32I-NEXT:    lbu t3, 8(a0)
-; RV32I-NEXT:    lbu t4, 9(a0)
-; RV32I-NEXT:    lbu t5, 10(a0)
-; RV32I-NEXT:    lbu t6, 11(a0)
-; RV32I-NEXT:    lbu s0, 12(a0)
-; RV32I-NEXT:    lbu s1, 13(a0)
-; RV32I-NEXT:    lbu s2, 14(a0)
-; RV32I-NEXT:    lbu s3, 15(a0)
-; RV32I-NEXT:    lbu s4, 16(a0)
-; RV32I-NEXT:    lbu s5, 17(a0)
-; RV32I-NEXT:    lbu s6, 18(a0)
-; RV32I-NEXT:    lbu s7, 19(a0)
-; RV32I-NEXT:    lbu s10, 1(a1)
-; RV32I-NEXT:    lbu s8, 20(a0)
-; RV32I-NEXT:    lbu s9, 21(a0)
-; RV32I-NEXT:    lbu s11, 0(a1)
-; RV32I-NEXT:    slli s10, s10, 8
-; RV32I-NEXT:    lbu ra, 2(a1)
-; RV32I-NEXT:    lbu a1, 3(a1)
-; RV32I-NEXT:    or s10, s10, s11
-; RV32I-NEXT:    lbu s11, 22(a0)
-; RV32I-NEXT:    slli ra, ra, 16
-; RV32I-NEXT:    slli a1, a1, 24
-; RV32I-NEXT:    or a1, a1, ra
-; RV32I-NEXT:    lbu ra, 23(a0)
-; RV32I-NEXT:    or t0, a1, s10
-; RV32I-NEXT:    lbu s10, 24(a0)
-; RV32I-NEXT:    lbu a7, 25(a0)
-; RV32I-NEXT:    lbu a6, 26(a0)
-; RV32I-NEXT:    lbu a5, 27(a0)
-; RV32I-NEXT:    lbu a1, 31(a0)
-; RV32I-NEXT:    lbu a3, 30(a0)
-; RV32I-NEXT:    lbu a4, 29(a0)
-; RV32I-NEXT:    lbu a0, 28(a0)
-; RV32I-NEXT:    sb a1, 59(sp)
-; RV32I-NEXT:    sb a3, 58(sp)
-; RV32I-NEXT:    sb a4, 57(sp)
-; RV32I-NEXT:    sb a0, 56(sp)
-; RV32I-NEXT:    sb a5, 55(sp)
-; RV32I-NEXT:    sb a6, 54(sp)
-; RV32I-NEXT:    sb a7, 53(sp)
-; RV32I-NEXT:    sb s10, 52(sp)
-; RV32I-NEXT:    sb ra, 51(sp)
-; RV32I-NEXT:    sb s11, 50(sp)
-; RV32I-NEXT:    sb s9, 49(sp)
-; RV32I-NEXT:    sb s8, 48(sp)
-; RV32I-NEXT:    sb s7, 47(sp)
-; RV32I-NEXT:    sb s6, 46(sp)
-; RV32I-NEXT:    sb s5, 45(sp)
-; RV32I-NEXT:    sb s4, 44(sp)
-; RV32I-NEXT:    sb zero, 91(sp)
-; RV32I-NEXT:    sb zero, 90(sp)
-; RV32I-NEXT:    sb zero, 89(sp)
-; RV32I-NEXT:    sb zero, 88(sp)
-; RV32I-NEXT:    sb zero, 87(sp)
-; RV32I-NEXT:    sb zero, 86(sp)
-; RV32I-NEXT:    sb zero, 85(sp)
-; RV32I-NEXT:    sb zero, 84(sp)
-; RV32I-NEXT:    sb zero, 83(sp)
-; RV32I-NEXT:    sb zero, 82(sp)
-; RV32I-NEXT:    sb zero, 81(sp)
-; RV32I-NEXT:    sb zero, 80(sp)
-; RV32I-NEXT:    sb zero, 79(sp)
-; RV32I-NEXT:    sb zero, 78(sp)
-; RV32I-NEXT:    sb zero, 77(sp)
-; RV32I-NEXT:    sb zero, 76(sp)
-; RV32I-NEXT:    sb zero, 75(sp)
-; RV32I-NEXT:    sb zero, 74(sp)
-; RV32I-NEXT:    sb zero, 73(sp)
-; RV32I-NEXT:    sb zero, 72(sp)
-; RV32I-NEXT:    sb zero, 71(sp)
-; RV32I-NEXT:    sb zero, 70(sp)
-; RV32I-NEXT:    sb zero, 69(sp)
-; RV32I-NEXT:    sb zero, 68(sp)
-; RV32I-NEXT:    sb zero, 67(sp)
-; RV32I-NEXT:    sb zero, 66(sp)
-; RV32I-NEXT:    sb zero, 65(sp)
-; RV32I-NEXT:    sb zero, 64(sp)
-; RV32I-NEXT:    sb zero, 63(sp)
-; RV32I-NEXT:    sb zero, 62(sp)
-; RV32I-NEXT:    sb zero, 61(sp)
-; RV32I-NEXT:    sb zero, 60(sp)
-; RV32I-NEXT:    sb s3, 43(sp)
-; RV32I-NEXT:    sb s2, 42(sp)
-; RV32I-NEXT:    sb s1, 41(sp)
-; RV32I-NEXT:    sb s0, 40(sp)
-; RV32I-NEXT:    sb t6, 39(sp)
-; RV32I-NEXT:    sb t5, 38(sp)
-; RV32I-NEXT:    sb t4, 37(sp)
-; RV32I-NEXT:    sb t3, 36(sp)
-; RV32I-NEXT:    sb t2, 35(sp)
-; RV32I-NEXT:    sb t1, 34(sp)
-; RV32I-NEXT:    lw a0, 4(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    sb a0, 33(sp)
-; RV32I-NEXT:    lw a0, 8(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    sb a0, 32(sp)
-; RV32I-NEXT:    lw a0, 12(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    sb a0, 31(sp)
-; RV32I-NEXT:    lw a0, 16(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    sb a0, 30(sp)
-; RV32I-NEXT:    lw a0, 20(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    sb a0, 29(sp)
-; RV32I-NEXT:    lw a0, 24(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    sb a0, 28(sp)
-; RV32I-NEXT:    slli a0, t0, 24
-; RV32I-NEXT:    srli a0, a0, 27
-; RV32I-NEXT:    addi a4, sp, 28
-; RV32I-NEXT:    add a4, a4, a0
-; RV32I-NEXT:    lbu a0, 5(a4)
-; RV32I-NEXT:    lbu a1, 4(a4)
-; RV32I-NEXT:    lbu a3, 6(a4)
-; RV32I-NEXT:    lbu a5, 7(a4)
-; RV32I-NEXT:    slli a0, a0, 8
-; RV32I-NEXT:    or a0, a0, a1
-; RV32I-NEXT:    slli a3, a3, 16
-; RV32I-NEXT:    slli a5, a5, 24
-; RV32I-NEXT:    or a3, a5, a3
-; RV32I-NEXT:    or t5, a3, a0
-; RV32I-NEXT:    andi a3, t0, 7
-; RV32I-NEXT:    lbu a0, 9(a4)
-; RV32I-NEXT:    lbu a1, 8(a4)
-; RV32I-NEXT:    lbu a5, 10(a4)
-; RV32I-NEXT:    lbu a6, 11(a4)
-; RV32I-NEXT:    slli a0, a0, 8
-; RV32I-NEXT:    or a0, a0, a1
+; RV32I-NEXT:    lbu a4, 0(a0)
+; RV32I-NEXT:    lbu a5, 2(a0)
+; RV32I-NEXT:    lbu a6, 3(a0)
+; RV32I-NEXT:    slli a3, a3, 8
+; RV32I-NEXT:    or a3, a3, a4
 ; RV32I-NEXT:    slli a5, a5, 16
 ; RV32I-NEXT:    slli a6, a6, 24
-; RV32I-NEXT:    or a1, a6, a5
-; RV32I-NEXT:    or a6, a1, a0
-; RV32I-NEXT:    slli a0, a6, 1
-; RV32I-NEXT:    not t1, a3
-; RV32I-NEXT:    sll a0, a0, t1
-; RV32I-NEXT:    lbu a1, 1(a4)
-; RV32I-NEXT:    lbu a5, 0(a4)
-; RV32I-NEXT:    lbu a7, 2(a4)
-; RV32I-NEXT:    lbu t0, 3(a4)
-; RV32I-NEXT:    slli a1, a1, 8
-; RV32I-NEXT:    or a1, a1, a5
+; RV32I-NEXT:    or a4, a6, a5
+; RV32I-NEXT:    or a3, a4, a3
+; RV32I-NEXT:    lbu a4, 5(a0)
+; RV32I-NEXT:    lbu a5, 4(a0)
+; RV32I-NEXT:    lbu a6, 6(a0)
+; RV32I-NEXT:    lbu a7, 7(a0)
+; RV32I-NEXT:    slli a4, a4, 8
+; RV32I-NEXT:    or a4, a4, a5
+; RV32I-NEXT:    slli a6, a6, 16
+; RV32I-NEXT:    slli a7, a7, 24
+; RV32I-NEXT:    or a5, a7, a6
+; RV32I-NEXT:    or a4, a5, a4
+; RV32I-NEXT:    lbu a5, 9(a0)
+; RV32I-NEXT:    lbu a6, 8(a0)
+; RV32I-NEXT:    lbu a7, 10(a0)
+; RV32I-NEXT:    lbu t0, 11(a0)
+; RV32I-NEXT:    slli a5, a5, 8
+; RV32I-NEXT:    or a5, a5, a6
 ; RV32I-NEXT:    slli a7, a7, 16
 ; RV32I-NEXT:    slli t0, t0, 24
-; RV32I-NEXT:    or a5, t0, a7
-; RV32I-NEXT:    or t0, a5, a1
-; RV32I-NEXT:    slli a1, t5, 1
-; RV32I-NEXT:    xori t2, a3, 31
-; RV32I-NEXT:    sll a1, a1, t2
-; RV32I-NEXT:    lbu a5, 13(a4)
-; RV32I-NEXT:    lbu a7, 12(a4)
-; RV32I-NEXT:    lbu t3, 14(a4)
-; RV32I-NEXT:    lbu t4, 15(a4)
-; RV32I-NEXT:    slli a5, a5, 8
-; RV32I-NEXT:    or a5, a5, a7
+; RV32I-NEXT:    or a6, t0, a7
+; RV32I-NEXT:    or a5, a6, a5
+; RV32I-NEXT:    lbu a6, 13(a0)
+; RV32I-NEXT:    lbu a7, 12(a0)
+; RV32I-NEXT:    lbu t0, 14(a0)
+; RV32I-NEXT:    lbu t1, 15(a0)
+; RV32I-NEXT:    slli a6, a6, 8
+; RV32I-NEXT:    or a6, a6, a7
+; RV32I-NEXT:    slli t0, t0, 16
+; RV32I-NEXT:    slli t1, t1, 24
+; RV32I-NEXT:    or a7, t1, t0
+; RV32I-NEXT:    or a6, a7, a6
+; RV32I-NEXT:    lbu a7, 17(a0)
+; RV32I-NEXT:    lbu t0, 16(a0)
+; RV32I-NEXT:    lbu t1, 18(a0)
+; RV32I-NEXT:    lbu t2, 19(a0)
+; RV32I-NEXT:    slli a7, a7, 8
+; RV32I-NEXT:    or a7, a7, t0
+; RV32I-NEXT:    slli t1, t1, 16
+; RV32I-NEXT:    slli t2, t2, 24
+; RV32I-NEXT:    or t0, t2, t1
+; RV32I-NEXT:    or t0, t0, a7
+; RV32I-NEXT:    lbu a7, 21(a0)
+; RV32I-NEXT:    lbu t1, 20(a0)
+; RV32I-NEXT:    lbu t2, 22(a0)
+; RV32I-NEXT:    lbu t3, 23(a0)
+; RV32I-NEXT:    slli a7, a7, 8
+; RV32I-NEXT:    or a7, a7, t1
+; RV32I-NEXT:    slli t2, t2, 16
+; RV32I-NEXT:    slli t3, t3, 24
+; RV32I-NEXT:    or t1, t3, t2
+; RV32I-NEXT:    or t1, t1, a7
+; RV32I-NEXT:    lbu a7, 25(a0)
+; RV32I-NEXT:    lbu t2, 24(a0)
+; RV32I-NEXT:    lbu t3, 26(a0)
+; RV32I-NEXT:    lbu t4, 27(a0)
+; RV32I-NEXT:    slli a7, a7, 8
+; RV32I-NEXT:    or a7, a7, t2
 ; RV32I-NEXT:    slli t3, t3, 16
 ; RV32I-NEXT:    slli t4, t4, 24
-; RV32I-NEXT:    or a7, t4, t3
-; RV32I-NEXT:    or t3, a7, a5
-; RV32I-NEXT:    lbu a5, 17(a4)
-; RV32I-NEXT:    lbu a7, 16(a4)
-; RV32I-NEXT:    lbu t4, 18(a4)
-; RV32I-NEXT:    lbu t6, 19(a4)
-; RV32I-NEXT:    slli a5, a5, 8
-; RV32I-NEXT:    or a5, a5, a7
+; RV32I-NEXT:    or t2, t4, t3
+; RV32I-NEXT:    or t2, t2, a7
+; RV32I-NEXT:    lbu a7, 29(a0)
+; RV32I-NEXT:    lbu t3, 28(a0)
+; RV32I-NEXT:    lbu t4, 30(a0)
+; RV32I-NEXT:    lbu a0, 31(a0)
+; RV32I-NEXT:    slli a7, a7, 8
+; RV32I-NEXT:    or a7, a7, t3
 ; RV32I-NEXT:    slli t4, t4, 16
-; RV32I-NEXT:    slli t6, t6, 24
-; RV32I-NEXT:    or a7, t6, t4
-; RV32I-NEXT:    or t4, a7, a5
-; RV32I-NEXT:    slli a5, t4, 1
-; RV32I-NEXT:    sll a7, a5, t1
-; RV32I-NEXT:    lbu a5, 21(a4)
-; RV32I-NEXT:    lbu t6, 20(a4)
-; RV32I-NEXT:    lbu s0, 22(a4)
-; RV32I-NEXT:    lbu s1, 23(a4)
-; RV32I-NEXT:    slli a5, a5, 8
-; RV32I-NEXT:    or a5, a5, t6
-; RV32I-NEXT:    slli s0, s0, 16
-; RV32I-NEXT:    slli s1, s1, 24
-; RV32I-NEXT:    or s0, s1, s0
-; RV32I-NEXT:    or s0, s0, a5
-; RV32I-NEXT:    lbu a5, 25(a4)
-; RV32I-NEXT:    lbu t6, 24(a4)
-; RV32I-NEXT:    lbu s1, 26(a4)
-; RV32I-NEXT:    lbu s2, 27(a4)
-; RV32I-NEXT:    slli a5, a5, 8
-; RV32I-NEXT:    or a5, a5, t6
-; RV32I-NEXT:    slli s1, s1, 16
-; RV32I-NEXT:    slli s2, s2, 24
-; RV32I-NEXT:    or t6, s2, s1
-; RV32I-NEXT:    or t6, t6, a5
-; RV32I-NEXT:    lbu a5, 29(a4)
-; RV32I-NEXT:    lbu s1, 28(a4)
-; RV32I-NEXT:    slli s2, t6, 1
-; RV32I-NEXT:    sll t1, s2, t1
-; RV32I-NEXT:    slli a5, a5, 8
-; RV32I-NEXT:    or a5, a5, s1
-; RV32I-NEXT:    lbu s1, 30(a4)
-; RV32I-NEXT:    lbu a4, 31(a4)
-; RV32I-NEXT:    slli s2, t3, 1
-; RV32I-NEXT:    sll s2, s2, t2
-; RV32I-NEXT:    slli s1, s1, 16
-; RV32I-NEXT:    slli a4, a4, 24
-; RV32I-NEXT:    or a4, a4, s1
-; RV32I-NEXT:    slli s1, s0, 1
-; RV32I-NEXT:    sll s1, s1, t2
-; RV32I-NEXT:    or s3, a4, a5
-; RV32I-NEXT:    slli a4, s3, 1
-; RV32I-NEXT:    sll t2, a4, t2
-; RV32I-NEXT:    srl a4, t5, a3
-; RV32I-NEXT:    srl a5, t0, a3
-; RV32I-NEXT:    srl t0, t3, a3
-; RV32I-NEXT:    srl a6, a6, a3
-; RV32I-NEXT:    srl t3, s0, a3
-; RV32I-NEXT:    srl t4, t4, a3
-; RV32I-NEXT:    srl t5, t6, a3
-; RV32I-NEXT:    srl a3, s3, a3
-; RV32I-NEXT:    srli t6, t5, 16
-; RV32I-NEXT:    sb t6, 26(a2)
-; RV32I-NEXT:    or t2, t5, t2
-; RV32I-NEXT:    sb t5, 24(a2)
-; RV32I-NEXT:    srli t5, t5, 8
-; RV32I-NEXT:    sb t5, 25(a2)
-; RV32I-NEXT:    srli t5, a3, 24
-; RV32I-NEXT:    sb t5, 31(a2)
-; RV32I-NEXT:    srli t5, a3, 16
-; RV32I-NEXT:    sb t5, 30(a2)
-; RV32I-NEXT:    sb a3, 28(a2)
-; RV32I-NEXT:    srli a3, a3, 8
-; RV32I-NEXT:    sb a3, 29(a2)
-; RV32I-NEXT:    srli a3, t4, 16
-; RV32I-NEXT:    sb a3, 18(a2)
-; RV32I-NEXT:    or a3, t4, s1
-; RV32I-NEXT:    sb t4, 16(a2)
-; RV32I-NEXT:    srli t4, t4, 8
-; RV32I-NEXT:    sb t4, 17(a2)
-; RV32I-NEXT:    srli t4, t3, 16
-; RV32I-NEXT:    sb t4, 22(a2)
-; RV32I-NEXT:    or t1, t3, t1
-; RV32I-NEXT:    sb t3, 20(a2)
-; RV32I-NEXT:    srli t3, t3, 8
-; RV32I-NEXT:    sb t3, 21(a2)
-; RV32I-NEXT:    srli t3, a6, 16
-; RV32I-NEXT:    sb t3, 10(a2)
-; RV32I-NEXT:    or t3, a6, s2
-; RV32I-NEXT:    sb a6, 8(a2)
-; RV32I-NEXT:    srli a6, a6, 8
-; RV32I-NEXT:    sb a6, 9(a2)
-; RV32I-NEXT:    srli a6, t0, 16
-; RV32I-NEXT:    sb a6, 14(a2)
-; RV32I-NEXT:    or a6, t0, a7
-; RV32I-NEXT:    sb t0, 12(a2)
-; RV32I-NEXT:    srli a7, t0, 8
-; RV32I-NEXT:    sb a7, 13(a2)
-; RV32I-NEXT:    srli a7, a5, 16
-; RV32I-NEXT:    sb a7, 2(a2)
-; RV32I-NEXT:    or a1, a5, a1
-; RV32I-NEXT:    sb a5, 0(a2)
-; RV32I-NEXT:    srli a5, a5, 8
-; RV32I-NEXT:    sb a5, 1(a2)
-; RV32I-NEXT:    srli a5, a4, 16
-; RV32I-NEXT:    sb a5, 6(a2)
-; RV32I-NEXT:    or a0, a4, a0
-; RV32I-NEXT:    sb a4, 4(a2)
+; RV32I-NEXT:    slli a0, a0, 24
+; RV32I-NEXT:    or a0, a0, t4
+; RV32I-NEXT:    or a0, a0, a7
+; RV32I-NEXT:    lbu a7, 1(a1)
+; RV32I-NEXT:    lbu t3, 0(a1)
+; RV32I-NEXT:    lbu t4, 2(a1)
+; RV32I-NEXT:    lbu a1, 3(a1)
+; RV32I-NEXT:    slli a7, a7, 8
+; RV32I-NEXT:    or a7, a7, t3
+; RV32I-NEXT:    slli t4, t4, 16
+; RV32I-NEXT:    slli a1, a1, 24
+; RV32I-NEXT:    or a1, a1, t4
+; RV32I-NEXT:    or a7, a1, a7
+; RV32I-NEXT:    sw zero, 60(sp)
+; RV32I-NEXT:    sw zero, 56(sp)
+; RV32I-NEXT:    sw zero, 52(sp)
+; RV32I-NEXT:    sw zero, 48(sp)
+; RV32I-NEXT:    sw zero, 44(sp)
+; RV32I-NEXT:    sw zero, 40(sp)
+; RV32I-NEXT:    sw zero, 36(sp)
+; RV32I-NEXT:    sw zero, 32(sp)
+; RV32I-NEXT:    sw a0, 28(sp)
+; RV32I-NEXT:    sw t2, 24(sp)
+; RV32I-NEXT:    sw t1, 20(sp)
+; RV32I-NEXT:    sw t0, 16(sp)
+; RV32I-NEXT:    sw a6, 12(sp)
+; RV32I-NEXT:    sw a5, 8(sp)
+; RV32I-NEXT:    sw a4, 4(sp)
+; RV32I-NEXT:    sw a3, 0(sp)
+; RV32I-NEXT:    srli a0, a7, 3
+; RV32I-NEXT:    andi a0, a0, 28
+; RV32I-NEXT:    mv a1, sp
+; RV32I-NEXT:    add a4, a1, a0
+; RV32I-NEXT:    lw a1, 4(a4)
+; RV32I-NEXT:    srl a0, a1, a7
+; RV32I-NEXT:    lw a5, 8(a4)
+; RV32I-NEXT:    andi a3, a7, 31
+; RV32I-NEXT:    xori a6, a3, 31
+; RV32I-NEXT:    lw a3, 0(a4)
+; RV32I-NEXT:    slli t0, a5, 1
+; RV32I-NEXT:    sll t0, t0, a6
+; RV32I-NEXT:    or a0, a0, t0
+; RV32I-NEXT:    srl a3, a3, a7
+; RV32I-NEXT:    slli a1, a1, 1
+; RV32I-NEXT:    lw t0, 12(a4)
+; RV32I-NEXT:    lw t1, 16(a4)
+; RV32I-NEXT:    sll a1, a1, a6
+; RV32I-NEXT:    or a1, a3, a1
+; RV32I-NEXT:    srl a3, t0, a7
+; RV32I-NEXT:    slli t2, t1, 1
+; RV32I-NEXT:    sll t2, t2, a6
+; RV32I-NEXT:    or a3, a3, t2
+; RV32I-NEXT:    srl a5, a5, a7
+; RV32I-NEXT:    slli t0, t0, 1
+; RV32I-NEXT:    lw t2, 20(a4)
+; RV32I-NEXT:    lw t3, 24(a4)
+; RV32I-NEXT:    sll t0, t0, a6
+; RV32I-NEXT:    or a5, a5, t0
+; RV32I-NEXT:    srl t0, t2, a7
+; RV32I-NEXT:    slli t4, t3, 1
+; RV32I-NEXT:    sll t4, t4, a6
+; RV32I-NEXT:    or t0, t0, t4
+; RV32I-NEXT:    srl t1, t1, a7
+; RV32I-NEXT:    slli t2, t2, 1
+; RV32I-NEXT:    lw a4, 28(a4)
+; RV32I-NEXT:    sll t2, t2, a6
+; RV32I-NEXT:    or t1, t1, t2
+; RV32I-NEXT:    srl t2, t3, a7
+; RV32I-NEXT:    slli t3, a4, 1
+; RV32I-NEXT:    sll a6, t3, a6
+; RV32I-NEXT:    or a6, t2, a6
+; RV32I-NEXT:    srl a4, a4, a7
+; RV32I-NEXT:    sb a4, 28(a2)
+; RV32I-NEXT:    srli a7, a4, 24
+; RV32I-NEXT:    sb a7, 31(a2)
+; RV32I-NEXT:    srli a7, a4, 16
+; RV32I-NEXT:    sb a7, 30(a2)
 ; RV32I-NEXT:    srli a4, a4, 8
-; RV32I-NEXT:    sb a4, 5(a2)
-; RV32I-NEXT:    srli a4, t2, 24
+; RV32I-NEXT:    sb a4, 29(a2)
+; RV32I-NEXT:    sb a6, 24(a2)
+; RV32I-NEXT:    sb t1, 16(a2)
+; RV32I-NEXT:    sb t0, 20(a2)
+; RV32I-NEXT:    sb a5, 8(a2)
+; RV32I-NEXT:    sb a3, 12(a2)
+; RV32I-NEXT:    sb a1, 0(a2)
+; RV32I-NEXT:    sb a0, 4(a2)
+; RV32I-NEXT:    srli a4, a6, 24
 ; RV32I-NEXT:    sb a4, 27(a2)
-; RV32I-NEXT:    srli a3, a3, 24
-; RV32I-NEXT:    sb a3, 19(a2)
-; RV32I-NEXT:    srli a3, t1, 24
-; RV32I-NEXT:    sb a3, 23(a2)
-; RV32I-NEXT:    srli a3, t3, 24
-; RV32I-NEXT:    sb a3, 11(a2)
-; RV32I-NEXT:    srli a3, a6, 24
-; RV32I-NEXT:    sb a3, 15(a2)
-; RV32I-NEXT:    srli a1, a1, 24
-; RV32I-NEXT:    sb a1, 3(a2)
-; RV32I-NEXT:    srli a0, a0, 24
-; RV32I-NEXT:    sb a0, 7(a2)
-; RV32I-NEXT:    lw ra, 140(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s0, 136(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s1, 132(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s2, 128(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s3, 124(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s4, 120(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s5, 116(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s6, 112(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s7, 108(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s8, 104(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s9, 100(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s10, 96(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s11, 92(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    addi sp, sp, 144
+; RV32I-NEXT:    srli a4, a6, 16
+; RV32I-NEXT:    sb a4, 26(a2)
+; RV32I-NEXT:    srli a4, a6, 8
+; RV32I-NEXT:    sb a4, 25(a2)
+; RV32I-NEXT:    srli a4, t1, 24
+; RV32I-NEXT:    sb a4, 19(a2)
+; RV32I-NEXT:    srli a4, t1, 16
+; RV32I-NEXT:    sb a4, 18(a2)
+; RV32I-NEXT:    srli a4, t1, 8
+; RV32I-NEXT:    sb a4, 17(a2)
+; RV32I-NEXT:    srli a4, t0, 24
+; RV32I-NEXT:    sb a4, 23(a2)
+; RV32I-NEXT:    srli a4, t0, 16
+; RV32I-NEXT:    sb a4, 22(a2)
+; RV32I-NEXT:    srli a4, t0, 8
+; RV32I-NEXT:    sb a4, 21(a2)
+; RV32I-NEXT:    srli a4, a5, 24
+; RV32I-NEXT:    sb a4, 11(a2)
+; RV32I-NEXT:    srli a4, a5, 16
+; RV32I-NEXT:    sb a4, 10(a2)
+; RV32I-NEXT:    srli a5, a5, 8
+; RV32I-NEXT:    sb a5, 9(a2)
+; RV32I-NEXT:    srli a4, a3, 24
+; RV32I-NEXT:    sb a4, 15(a2)
+; RV32I-NEXT:    srli a4, a3, 16
+; RV32I-NEXT:    sb a4, 14(a2)
+; RV32I-NEXT:    srli a3, a3, 8
+; RV32I-NEXT:    sb a3, 13(a2)
+; RV32I-NEXT:    srli a3, a1, 24
+; RV32I-NEXT:    sb a3, 3(a2)
+; RV32I-NEXT:    srli a3, a1, 16
+; RV32I-NEXT:    sb a3, 2(a2)
+; RV32I-NEXT:    srli a1, a1, 8
+; RV32I-NEXT:    sb a1, 1(a2)
+; RV32I-NEXT:    srli a1, a0, 24
+; RV32I-NEXT:    sb a1, 7(a2)
+; RV32I-NEXT:    srli a1, a0, 16
+; RV32I-NEXT:    sb a1, 6(a2)
+; RV32I-NEXT:    srli a0, a0, 8
+; RV32I-NEXT:    sb a0, 5(a2)
+; RV32I-NEXT:    addi sp, sp, 64
 ; RV32I-NEXT:    ret
   %src = load i256, ptr %src.ptr, align 1
   %bitOff = load i256, ptr %bitOff.ptr, align 1
@@ -2104,191 +1723,43 @@ define void @lshr_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 define void @shl_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; RV64I-LABEL: shl_32bytes:
 ; RV64I:       # %bb.0:
-; RV64I-NEXT:    addi sp, sp, -224
-; RV64I-NEXT:    sd ra, 216(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    sd s0, 208(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    sd s1, 200(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    sd s2, 192(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    sd s3, 184(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    sd s4, 176(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    sd s5, 168(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    sd s6, 160(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    sd s7, 152(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    sd s8, 144(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    sd s9, 136(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    sd s10, 128(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    sd s11, 120(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    lbu a3, 0(a0)
-; RV64I-NEXT:    sd a3, 48(sp) # 8-byte Folded Spill
+; RV64I-NEXT:    addi sp, sp, -64
 ; RV64I-NEXT:    lbu a3, 1(a0)
-; RV64I-NEXT:    sd a3, 40(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    lbu a3, 2(a0)
-; RV64I-NEXT:    sd a3, 32(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    lbu a3, 3(a0)
-; RV64I-NEXT:    sd a3, 24(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    lbu a3, 4(a0)
-; RV64I-NEXT:    sd a3, 16(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    lbu a3, 5(a0)
-; RV64I-NEXT:    sd a3, 8(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    lbu t1, 6(a0)
-; RV64I-NEXT:    lbu t2, 7(a0)
-; RV64I-NEXT:    lbu t3, 8(a0)
-; RV64I-NEXT:    lbu t4, 9(a0)
-; RV64I-NEXT:    lbu t5, 10(a0)
-; RV64I-NEXT:    lbu t6, 11(a0)
-; RV64I-NEXT:    lbu s0, 12(a0)
-; RV64I-NEXT:    lbu s1, 13(a0)
-; RV64I-NEXT:    lbu s2, 14(a0)
-; RV64I-NEXT:    lbu s3, 15(a0)
-; RV64I-NEXT:    lbu s4, 16(a0)
-; RV64I-NEXT:    lbu s5, 17(a0)
-; RV64I-NEXT:    lbu s6, 18(a0)
-; RV64I-NEXT:    lbu s7, 19(a0)
-; RV64I-NEXT:    lbu s8, 20(a0)
-; RV64I-NEXT:    lbu s9, 1(a1)
-; RV64I-NEXT:    lbu s10, 0(a1)
-; RV64I-NEXT:    lbu s11, 2(a1)
-; RV64I-NEXT:    lbu ra, 3(a1)
-; RV64I-NEXT:    slli s9, s9, 8
-; RV64I-NEXT:    or s9, s9, s10
-; RV64I-NEXT:    slli s11, s11, 16
-; RV64I-NEXT:    slli ra, ra, 24
-; RV64I-NEXT:    lbu s10, 5(a1)
-; RV64I-NEXT:    or s11, ra, s11
-; RV64I-NEXT:    or s11, s11, s9
-; RV64I-NEXT:    lbu s9, 4(a1)
-; RV64I-NEXT:    slli s10, s10, 8
-; RV64I-NEXT:    lbu ra, 6(a1)
-; RV64I-NEXT:    lbu a1, 7(a1)
-; RV64I-NEXT:    or s10, s10, s9
-; RV64I-NEXT:    lbu s9, 21(a0)
-; RV64I-NEXT:    slli ra, ra, 16
-; RV64I-NEXT:    slli a1, a1, 24
-; RV64I-NEXT:    or a1, a1, ra
-; RV64I-NEXT:    lbu ra, 22(a0)
-; RV64I-NEXT:    or a1, a1, s10
-; RV64I-NEXT:    lbu s10, 23(a0)
-; RV64I-NEXT:    slli a1, a1, 32
-; RV64I-NEXT:    or t0, a1, s11
-; RV64I-NEXT:    lbu s11, 24(a0)
-; RV64I-NEXT:    lbu a7, 25(a0)
-; RV64I-NEXT:    lbu a6, 26(a0)
-; RV64I-NEXT:    lbu a5, 27(a0)
-; RV64I-NEXT:    lbu a1, 31(a0)
-; RV64I-NEXT:    lbu a3, 30(a0)
-; RV64I-NEXT:    lbu a4, 29(a0)
-; RV64I-NEXT:    lbu a0, 28(a0)
-; RV64I-NEXT:    sb a1, 119(sp)
-; RV64I-NEXT:    sb a3, 118(sp)
-; RV64I-NEXT:    sb a4, 117(sp)
-; RV64I-NEXT:    sb a0, 116(sp)
-; RV64I-NEXT:    sb a5, 115(sp)
-; RV64I-NEXT:    sb a6, 114(sp)
-; RV64I-NEXT:    sb a7, 113(sp)
-; RV64I-NEXT:    sb s11, 112(sp)
-; RV64I-NEXT:    sb s10, 111(sp)
-; RV64I-NEXT:    sb ra, 110(sp)
-; RV64I-NEXT:    sb s9, 109(sp)
-; RV64I-NEXT:    sb s8, 108(sp)
-; RV64I-NEXT:    sb s7, 107(sp)
-; RV64I-NEXT:    sb s6, 106(sp)
-; RV64I-NEXT:    sb s5, 105(sp)
-; RV64I-NEXT:    sb s4, 104(sp)
-; RV64I-NEXT:    sb s3, 103(sp)
-; RV64I-NEXT:    sb s2, 102(sp)
-; RV64I-NEXT:    sb s1, 101(sp)
-; RV64I-NEXT:    sb s0, 100(sp)
-; RV64I-NEXT:    sb t6, 99(sp)
-; RV64I-NEXT:    sb t5, 98(sp)
-; RV64I-NEXT:    sb t4, 97(sp)
-; RV64I-NEXT:    sb t3, 96(sp)
-; RV64I-NEXT:    sb zero, 87(sp)
-; RV64I-NEXT:    sb zero, 86(sp)
-; RV64I-NEXT:    sb zero, 85(sp)
-; RV64I-NEXT:    sb zero, 84(sp)
-; RV64I-NEXT:    sb zero, 83(sp)
-; RV64I-NEXT:    sb zero, 82(sp)
-; RV64I-NEXT:    sb zero, 81(sp)
-; RV64I-NEXT:    sb zero, 80(sp)
-; RV64I-NEXT:    sb zero, 79(sp)
-; RV64I-NEXT:    sb zero, 78(sp)
-; RV64I-NEXT:    sb zero, 77(sp)
-; RV64I-NEXT:    sb zero, 76(sp)
-; RV64I-NEXT:    sb zero, 75(sp)
-; RV64I-NEXT:    sb zero, 74(sp)
-; RV64I-NEXT:    sb zero, 73(sp)
-; RV64I-NEXT:    sb zero, 72(sp)
-; RV64I-NEXT:    sb zero, 71(sp)
-; RV64I-NEXT:    sb zero, 70(sp)
-; RV64I-NEXT:    sb zero, 69(sp)
-; RV64I-NEXT:    sb zero, 68(sp)
-; RV64I-NEXT:    sb zero, 67(sp)
-; RV64I-NEXT:    sb zero, 66(sp)
-; RV64I-NEXT:    sb zero, 65(sp)
-; RV64I-NEXT:    sb zero, 64(sp)
-; RV64I-NEXT:    sb zero, 63(sp)
-; RV64I-NEXT:    sb zero, 62(sp)
-; RV64I-NEXT:    sb zero, 61(sp)
-; RV64I-NEXT:    sb zero, 60(sp)
-; RV64I-NEXT:    sb zero, 59(sp)
-; RV64I-NEXT:    sb zero, 58(sp)
-; RV64I-NEXT:    sb zero, 57(sp)
-; RV64I-NEXT:    sb zero, 56(sp)
-; RV64I-NEXT:    sb t2, 95(sp)
-; RV64I-NEXT:    sb t1, 94(sp)
-; RV64I-NEXT:    ld a0, 8(sp) # 8-byte Folded Reload
-; RV64I-NEXT:    sb a0, 93(sp)
-; RV64I-NEXT:    ld a0, 16(sp) # 8-byte Folded Reload
-; RV64I-NEXT:    sb a0, 92(sp)
-; RV64I-NEXT:    ld a0, 24(sp) # 8-byte Folded Reload
-; RV64I-NEXT:    sb a0, 91(sp)
-; RV64I-NEXT:    ld a0, 32(sp) # 8-byte Folded Reload
-; RV64I-NEXT:    sb a0, 90(sp)
-; RV64I-NEXT:    ld a0, 40(sp) # 8-byte Folded Reload
-; RV64I-NEXT:    sb a0, 89(sp)
-; RV64I-NEXT:    ld a0, 48(sp) # 8-byte Folded Reload
-; RV64I-NEXT:    sb a0, 88(sp)
-; RV64I-NEXT:    slli a0, t0, 56
-; RV64I-NEXT:    srli a0, a0, 59
-; RV64I-NEXT:    addi a1, sp, 88
-; RV64I-NEXT:    sub a0, a1, a0
-; RV64I-NEXT:    lbu a1, 9(a0)
-; RV64I-NEXT:    lbu a3, 8(a0)
-; RV64I-NEXT:    lbu a4, 10(a0)
-; RV64I-NEXT:    lbu a5, 11(a0)
-; RV64I-NEXT:    slli a1, a1, 8
-; RV64I-NEXT:    or a1, a1, a3
-; RV64I-NEXT:    slli a4, a4, 16
-; RV64I-NEXT:    slli a5, a5, 24
-; RV64I-NEXT:    or a4, a5, a4
-; RV64I-NEXT:    or a1, a4, a1
-; RV64I-NEXT:    lbu a3, 13(a0)
-; RV64I-NEXT:    lbu a4, 12(a0)
-; RV64I-NEXT:    lbu a5, 14(a0)
-; RV64I-NEXT:    lbu a6, 15(a0)
+; RV64I-NEXT:    lbu a4, 0(a0)
+; RV64I-NEXT:    lbu a5, 2(a0)
+; RV64I-NEXT:    lbu a6, 3(a0)
 ; RV64I-NEXT:    slli a3, a3, 8
 ; RV64I-NEXT:    or a3, a3, a4
 ; RV64I-NEXT:    slli a5, a5, 16
 ; RV64I-NEXT:    slli a6, a6, 24
 ; RV64I-NEXT:    or a4, a6, a5
 ; RV64I-NEXT:    or a3, a4, a3
-; RV64I-NEXT:    slli a3, a3, 32
-; RV64I-NEXT:    or a3, a3, a1
-; RV64I-NEXT:    andi a1, t0, 7
-; RV64I-NEXT:    lbu a4, 1(a0)
-; RV64I-NEXT:    lbu a5, 0(a0)
-; RV64I-NEXT:    lbu a6, 2(a0)
-; RV64I-NEXT:    lbu a7, 3(a0)
+; RV64I-NEXT:    lbu a4, 5(a0)
+; RV64I-NEXT:    lbu a5, 4(a0)
+; RV64I-NEXT:    lbu a6, 6(a0)
+; RV64I-NEXT:    lbu a7, 7(a0)
+; RV64I-NEXT:    slli a4, a4, 8
+; RV64I-NEXT:    or a4, a4, a5
+; RV64I-NEXT:    slli a6, a6, 16
+; RV64I-NEXT:    slli a7, a7, 24
+; RV64I-NEXT:    or a5, a7, a6
+; RV64I-NEXT:    or a4, a5, a4
+; RV64I-NEXT:    slli a4, a4, 32
+; RV64I-NEXT:    or a3, a4, a3
+; RV64I-NEXT:    lbu a4, 9(a0)
+; RV64I-NEXT:    lbu a5, 8(a0)
+; RV64I-NEXT:    lbu a6, 10(a0)
+; RV64I-NEXT:    lbu a7, 11(a0)
 ; RV64I-NEXT:    slli a4, a4, 8
 ; RV64I-NEXT:    or a4, a4, a5
 ; RV64I-NEXT:    slli a6, a6, 16
 ; RV64I-NEXT:    slli a7, a7, 24
 ; RV64I-NEXT:    or a5, a7, a6
 ; RV64I-NEXT:    or a4, a5, a4
-; RV64I-NEXT:    lbu a5, 5(a0)
-; RV64I-NEXT:    lbu a6, 4(a0)
-; RV64I-NEXT:    lbu a7, 6(a0)
-; RV64I-NEXT:    lbu t0, 7(a0)
+; RV64I-NEXT:    lbu a5, 13(a0)
+; RV64I-NEXT:    lbu a6, 12(a0)
+; RV64I-NEXT:    lbu a7, 14(a0)
+; RV64I-NEXT:    lbu t0, 15(a0)
 ; RV64I-NEXT:    slli a5, a5, 8
 ; RV64I-NEXT:    or a5, a5, a6
 ; RV64I-NEXT:    slli a7, a7, 16
@@ -2297,20 +1768,20 @@ define void @shl_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; RV64I-NEXT:    or a5, a6, a5
 ; RV64I-NEXT:    slli a5, a5, 32
 ; RV64I-NEXT:    or a4, a5, a4
-; RV64I-NEXT:    lbu a5, 25(a0)
-; RV64I-NEXT:    lbu a6, 24(a0)
-; RV64I-NEXT:    lbu a7, 26(a0)
-; RV64I-NEXT:    lbu t0, 27(a0)
+; RV64I-NEXT:    lbu a5, 17(a0)
+; RV64I-NEXT:    lbu a6, 16(a0)
+; RV64I-NEXT:    lbu a7, 18(a0)
+; RV64I-NEXT:    lbu t0, 19(a0)
 ; RV64I-NEXT:    slli a5, a5, 8
 ; RV64I-NEXT:    or a5, a5, a6
 ; RV64I-NEXT:    slli a7, a7, 16
 ; RV64I-NEXT:    slli t0, t0, 24
 ; RV64I-NEXT:    or a6, t0, a7
 ; RV64I-NEXT:    or a5, a6, a5
-; RV64I-NEXT:    lbu a6, 29(a0)
-; RV64I-NEXT:    lbu a7, 28(a0)
-; RV64I-NEXT:    lbu t0, 30(a0)
-; RV64I-NEXT:    lbu t1, 31(a0)
+; RV64I-NEXT:    lbu a6, 21(a0)
+; RV64I-NEXT:    lbu a7, 20(a0)
+; RV64I-NEXT:    lbu t0, 22(a0)
+; RV64I-NEXT:    lbu t1, 23(a0)
 ; RV64I-NEXT:    slli a6, a6, 8
 ; RV64I-NEXT:    or a6, a6, a7
 ; RV64I-NEXT:    slli t0, t0, 16
@@ -2319,439 +1790,353 @@ define void @shl_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; RV64I-NEXT:    or a6, a7, a6
 ; RV64I-NEXT:    slli a6, a6, 32
 ; RV64I-NEXT:    or a5, a6, a5
-; RV64I-NEXT:    lbu a6, 17(a0)
-; RV64I-NEXT:    lbu a7, 16(a0)
-; RV64I-NEXT:    lbu t0, 18(a0)
-; RV64I-NEXT:    lbu t1, 19(a0)
+; RV64I-NEXT:    lbu a6, 25(a0)
+; RV64I-NEXT:    lbu a7, 24(a0)
+; RV64I-NEXT:    lbu t0, 26(a0)
+; RV64I-NEXT:    lbu t1, 27(a0)
 ; RV64I-NEXT:    slli a6, a6, 8
 ; RV64I-NEXT:    or a6, a6, a7
 ; RV64I-NEXT:    slli t0, t0, 16
 ; RV64I-NEXT:    slli t1, t1, 24
-; RV64I-NEXT:    lbu a7, 21(a0)
-; RV64I-NEXT:    or t0, t1, t0
-; RV64I-NEXT:    or a6, t0, a6
-; RV64I-NEXT:    lbu t0, 20(a0)
+; RV64I-NEXT:    or a7, t1, t0
+; RV64I-NEXT:    or a6, a7, a6
+; RV64I-NEXT:    lbu a7, 29(a0)
+; RV64I-NEXT:    lbu t0, 28(a0)
+; RV64I-NEXT:    lbu t1, 30(a0)
+; RV64I-NEXT:    lbu a0, 31(a0)
 ; RV64I-NEXT:    slli a7, a7, 8
-; RV64I-NEXT:    lbu t1, 22(a0)
-; RV64I-NEXT:    lbu a0, 23(a0)
 ; RV64I-NEXT:    or a7, a7, t0
-; RV64I-NEXT:    srli t0, a4, 1
 ; RV64I-NEXT:    slli t1, t1, 16
 ; RV64I-NEXT:    slli a0, a0, 24
-; RV64I-NEXT:    or t1, a0, t1
-; RV64I-NEXT:    xori t2, a1, 63
-; RV64I-NEXT:    srl a0, t0, t2
-; RV64I-NEXT:    or a7, t1, a7
-; RV64I-NEXT:    slli a7, a7, 32
+; RV64I-NEXT:    or a0, a0, t1
+; RV64I-NEXT:    or a0, a0, a7
+; RV64I-NEXT:    slli a0, a0, 32
+; RV64I-NEXT:    or a0, a0, a6
+; RV64I-NEXT:    lbu a6, 1(a1)
+; RV64I-NEXT:    lbu a7, 0(a1)
+; RV64I-NEXT:    lbu t0, 2(a1)
+; RV64I-NEXT:    lbu t1, 3(a1)
+; RV64I-NEXT:    slli a6, a6, 8
+; RV64I-NEXT:    or a6, a6, a7
+; RV64I-NEXT:    slli t0, t0, 16
+; RV64I-NEXT:    slli t1, t1, 24
+; RV64I-NEXT:    or a7, t1, t0
 ; RV64I-NEXT:    or a6, a7, a6
-; RV64I-NEXT:    srli a7, a6, 1
-; RV64I-NEXT:    srl a7, a7, t2
+; RV64I-NEXT:    lbu a7, 5(a1)
+; RV64I-NEXT:    lbu t0, 4(a1)
+; RV64I-NEXT:    lbu t1, 6(a1)
+; RV64I-NEXT:    lbu a1, 7(a1)
+; RV64I-NEXT:    slli a7, a7, 8
+; RV64I-NEXT:    or a7, a7, t0
+; RV64I-NEXT:    slli t1, t1, 16
+; RV64I-NEXT:    slli a1, a1, 24
+; RV64I-NEXT:    or a1, a1, t1
+; RV64I-NEXT:    or a1, a1, a7
+; RV64I-NEXT:    slli a1, a1, 32
+; RV64I-NEXT:    or a1, a1, a6
+; RV64I-NEXT:    sd zero, 24(sp)
+; RV64I-NEXT:    sd zero, 16(sp)
+; RV64I-NEXT:    sd zero, 8(sp)
+; RV64I-NEXT:    sd zero, 0(sp)
+; RV64I-NEXT:    sd a0, 56(sp)
+; RV64I-NEXT:    sd a5, 48(sp)
+; RV64I-NEXT:    sd a4, 40(sp)
+; RV64I-NEXT:    sd a3, 32(sp)
+; RV64I-NEXT:    srli a0, a1, 3
+; RV64I-NEXT:    andi a0, a0, 24
+; RV64I-NEXT:    addi a3, sp, 32
+; RV64I-NEXT:    sub a3, a3, a0
+; RV64I-NEXT:    ld a4, 8(a3)
+; RV64I-NEXT:    ld a5, 0(a3)
+; RV64I-NEXT:    sll a0, a4, a1
+; RV64I-NEXT:    andi a6, a1, 63
+; RV64I-NEXT:    xori a6, a6, 63
+; RV64I-NEXT:    srli a7, a5, 1
+; RV64I-NEXT:    ld t0, 24(a3)
+; RV64I-NEXT:    ld a3, 16(a3)
+; RV64I-NEXT:    srl a7, a7, a6
+; RV64I-NEXT:    or a0, a0, a7
+; RV64I-NEXT:    sll a7, t0, a1
 ; RV64I-NEXT:    srli t0, a3, 1
-; RV64I-NEXT:    not t1, a1
-; RV64I-NEXT:    srl t0, t0, t1
+; RV64I-NEXT:    srl t0, t0, a6
+; RV64I-NEXT:    or a7, a7, t0
 ; RV64I-NEXT:    sll a3, a3, a1
-; RV64I-NEXT:    sll a5, a5, a1
-; RV64I-NEXT:    sll a6, a6, a1
-; RV64I-NEXT:    sll a1, a4, a1
-; RV64I-NEXT:    srli a4, a6, 56
-; RV64I-NEXT:    sb a4, 23(a2)
-; RV64I-NEXT:    srli a4, a6, 48
-; RV64I-NEXT:    sb a4, 22(a2)
-; RV64I-NEXT:    srli a4, a6, 40
-; RV64I-NEXT:    sb a4, 21(a2)
-; RV64I-NEXT:    srli a4, a6, 32
-; RV64I-NEXT:    sb a4, 20(a2)
-; RV64I-NEXT:    srli a4, a6, 24
-; RV64I-NEXT:    sb a4, 19(a2)
-; RV64I-NEXT:    srli a4, a6, 16
-; RV64I-NEXT:    sb a4, 18(a2)
-; RV64I-NEXT:    or a4, a6, t0
-; RV64I-NEXT:    srli a6, a6, 8
-; RV64I-NEXT:    sb a6, 17(a2)
-; RV64I-NEXT:    srli a6, a5, 56
-; RV64I-NEXT:    sb a6, 31(a2)
-; RV64I-NEXT:    srli a6, a5, 48
-; RV64I-NEXT:    sb a6, 30(a2)
-; RV64I-NEXT:    srli a6, a5, 40
-; RV64I-NEXT:    sb a6, 29(a2)
-; RV64I-NEXT:    srli a6, a5, 32
-; RV64I-NEXT:    sb a6, 28(a2)
-; RV64I-NEXT:    srli a6, a5, 24
-; RV64I-NEXT:    sb a6, 27(a2)
-; RV64I-NEXT:    srli a6, a5, 16
-; RV64I-NEXT:    sb a6, 26(a2)
-; RV64I-NEXT:    or a6, a5, a7
-; RV64I-NEXT:    srli a5, a5, 8
-; RV64I-NEXT:    sb a5, 25(a2)
-; RV64I-NEXT:    srli a5, a1, 56
-; RV64I-NEXT:    sb a5, 7(a2)
-; RV64I-NEXT:    srli a5, a1, 48
-; RV64I-NEXT:    sb a5, 6(a2)
-; RV64I-NEXT:    srli a5, a1, 40
-; RV64I-NEXT:    sb a5, 5(a2)
-; RV64I-NEXT:    srli a5, a1, 32
-; RV64I-NEXT:    sb a5, 4(a2)
-; RV64I-NEXT:    srli a5, a1, 24
-; RV64I-NEXT:    sb a5, 3(a2)
-; RV64I-NEXT:    srli a5, a1, 16
-; RV64I-NEXT:    sb a5, 2(a2)
+; RV64I-NEXT:    srli a4, a4, 1
+; RV64I-NEXT:    srl a4, a4, a6
+; RV64I-NEXT:    or a3, a3, a4
+; RV64I-NEXT:    sll a1, a5, a1
 ; RV64I-NEXT:    sb a1, 0(a2)
+; RV64I-NEXT:    srli a4, a1, 56
+; RV64I-NEXT:    sb a4, 7(a2)
+; RV64I-NEXT:    srli a4, a1, 48
+; RV64I-NEXT:    sb a4, 6(a2)
+; RV64I-NEXT:    srli a4, a1, 40
+; RV64I-NEXT:    sb a4, 5(a2)
+; RV64I-NEXT:    srli a4, a1, 32
+; RV64I-NEXT:    sb a4, 4(a2)
+; RV64I-NEXT:    srli a4, a1, 24
+; RV64I-NEXT:    sb a4, 3(a2)
+; RV64I-NEXT:    srli a4, a1, 16
+; RV64I-NEXT:    sb a4, 2(a2)
 ; RV64I-NEXT:    srli a1, a1, 8
 ; RV64I-NEXT:    sb a1, 1(a2)
+; RV64I-NEXT:    sb a3, 16(a2)
+; RV64I-NEXT:    sb a7, 24(a2)
+; RV64I-NEXT:    sb a0, 8(a2)
 ; RV64I-NEXT:    srli a1, a3, 56
-; RV64I-NEXT:    sb a1, 15(a2)
+; RV64I-NEXT:    sb a1, 23(a2)
 ; RV64I-NEXT:    srli a1, a3, 48
-; RV64I-NEXT:    sb a1, 14(a2)
+; RV64I-NEXT:    sb a1, 22(a2)
 ; RV64I-NEXT:    srli a1, a3, 40
-; RV64I-NEXT:    sb a1, 13(a2)
+; RV64I-NEXT:    sb a1, 21(a2)
 ; RV64I-NEXT:    srli a1, a3, 32
-; RV64I-NEXT:    sb a1, 12(a2)
+; RV64I-NEXT:    sb a1, 20(a2)
 ; RV64I-NEXT:    srli a1, a3, 24
-; RV64I-NEXT:    sb a1, 11(a2)
+; RV64I-NEXT:    sb a1, 19(a2)
 ; RV64I-NEXT:    srli a1, a3, 16
-; RV64I-NEXT:    sb a1, 10(a2)
-; RV64I-NEXT:    or a0, a3, a0
+; RV64I-NEXT:    sb a1, 18(a2)
 ; RV64I-NEXT:    srli a3, a3, 8
-; RV64I-NEXT:    sb a3, 9(a2)
-; RV64I-NEXT:    sb a4, 16(a2)
-; RV64I-NEXT:    sb a6, 24(a2)
-; RV64I-NEXT:    sb a0, 8(a2)
-; RV64I-NEXT:    ld ra, 216(sp) # 8-byte Folded Reload
-; RV64I-NEXT:    ld s0, 208(sp) # 8-byte Folded Reload
-; RV64I-NEXT:    ld s1, 200(sp) # 8-byte Folded Reload
-; RV64I-NEXT:    ld s2, 192(sp) # 8-byte Folded Reload
-; RV64I-NEXT:    ld s3, 184(sp) # 8-byte Folded Reload
-; RV64I-NEXT:    ld s4, 176(sp) # 8-byte Folded Reload
-; RV64I-NEXT:    ld s5, 168(sp) # 8-byte Folded Reload
-; RV64I-NEXT:    ld s6, 160(sp) # 8-byte Folded Reload
-; RV64I-NEXT:    ld s7, 152(sp) # 8-byte Folded Reload
-; RV64I-NEXT:    ld s8, 144(sp) # 8-byte Folded Reload
-; RV64I-NEXT:    ld s9, 136(sp) # 8-byte Folded Reload
-; RV64I-NEXT:    ld s10, 128(sp) # 8-byte Folded Reload
-; RV64I-NEXT:    ld s11, 120(sp) # 8-byte Folded Reload
-; RV64I-NEXT:    addi sp, sp, 224
+; RV64I-NEXT:    sb a3, 17(a2)
+; RV64I-NEXT:    srli a1, a7, 56
+; RV64I-NEXT:    sb a1, 31(a2)
+; RV64I-NEXT:    srli a1, a7, 48
+; RV64I-NEXT:    sb a1, 30(a2)
+; RV64I-NEXT:    srli a1, a7, 40
+; RV64I-NEXT:    sb a1, 29(a2)
+; RV64I-NEXT:    srli a1, a7, 32
+; RV64I-NEXT:    sb a1, 28(a2)
+; RV64I-NEXT:    srli a1, a7, 24
+; RV64I-NEXT:    sb a1, 27(a2)
+; RV64I-NEXT:    srli a1, a7, 16
+; RV64I-NEXT:    sb a1, 26(a2)
+; RV64I-NEXT:    srli a1, a7, 8
+; RV64I-NEXT:    sb a1, 25(a2)
+; RV64I-NEXT:    srli a1, a0, 56
+; RV64I-NEXT:    sb a1, 15(a2)
+; RV64I-NEXT:    srli a1, a0, 48
+; RV64I-NEXT:    sb a1, 14(a2)
+; RV64I-NEXT:    srli a1, a0, 40
+; RV64I-NEXT:    sb a1, 13(a2)
+; RV64I-NEXT:    srli a1, a0, 32
+; RV64I-NEXT:    sb a1, 12(a2)
+; RV64I-NEXT:    srli a1, a0, 24
+; RV64I-NEXT:    sb a1, 11(a2)
+; RV64I-NEXT:    srli a1, a0, 16
+; RV64I-NEXT:    sb a1, 10(a2)
+; RV64I-NEXT:    srli a0, a0, 8
+; RV64I-NEXT:    sb a0, 9(a2)
+; RV64I-NEXT:    addi sp, sp, 64
 ; RV64I-NEXT:    ret
 ;
 ; RV32I-LABEL: shl_32bytes:
 ; RV32I:       # %bb.0:
-; RV32I-NEXT:    addi sp, sp, -144
-; RV32I-NEXT:    sw ra, 140(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s0, 136(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s1, 132(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s2, 128(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s3, 124(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s4, 120(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s5, 116(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s6, 112(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s7, 108(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s8, 104(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s9, 100(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s10, 96(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s11, 92(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    lbu a3, 0(a0)
-; RV32I-NEXT:    sw a3, 24(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    addi sp, sp, -64
 ; RV32I-NEXT:    lbu a3, 1(a0)
-; RV32I-NEXT:    sw a3, 20(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    lbu a3, 2(a0)
-; RV32I-NEXT:    sw a3, 16(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    lbu a3, 3(a0)
-; RV32I-NEXT:    sw a3, 12(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    lbu a3, 4(a0)
-; RV32I-NEXT:    sw a3, 8(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    lbu a3, 5(a0)
-; RV32I-NEXT:    sw a3, 4(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    lbu t1, 6(a0)
-; RV32I-NEXT:    lbu t2, 7(a0)
-; RV32I-NEXT:    lbu t3, 8(a0)
-; RV32I-NEXT:    lbu t4, 9(a0)
-; RV32I-NEXT:    lbu t5, 10(a0)
-; RV32I-NEXT:    lbu t6, 11(a0)
-; RV32I-NEXT:    lbu s0, 12(a0)
-; RV32I-NEXT:    lbu s1, 13(a0)
-; RV32I-NEXT:    lbu s2, 14(a0)
-; RV32I-NEXT:    lbu s3, 15(a0)
-; RV32I-NEXT:    lbu s4, 16(a0)
-; RV32I-NEXT:    lbu s5, 17(a0)
-; RV32I-NEXT:    lbu s6, 18(a0)
-; RV32I-NEXT:    lbu s7, 19(a0)
-; RV32I-NEXT:    lbu s10, 1(a1)
-; RV32I-NEXT:    lbu s8, 20(a0)
-; RV32I-NEXT:    lbu s9, 21(a0)
-; RV32I-NEXT:    lbu s11, 0(a1)
-; RV32I-NEXT:    slli s10, s10, 8
-; RV32I-NEXT:    lbu ra, 2(a1)
-; RV32I-NEXT:    lbu a1, 3(a1)
-; RV32I-NEXT:    or s10, s10, s11
-; RV32I-NEXT:    lbu s11, 22(a0)
-; RV32I-NEXT:    slli ra, ra, 16
-; RV32I-NEXT:    slli a1, a1, 24
-; RV32I-NEXT:    or a1, a1, ra
-; RV32I-NEXT:    lbu ra, 23(a0)
-; RV32I-NEXT:    or t0, a1, s10
-; RV32I-NEXT:    lbu s10, 24(a0)
-; RV32I-NEXT:    lbu a7, 25(a0)
-; RV32I-NEXT:    lbu a6, 26(a0)
-; RV32I-NEXT:    lbu a5, 27(a0)
-; RV32I-NEXT:    lbu a1, 31(a0)
-; RV32I-NEXT:    lbu a3, 30(a0)
-; RV32I-NEXT:    lbu a4, 29(a0)
-; RV32I-NEXT:    lbu a0, 28(a0)
-; RV32I-NEXT:    sb a1, 91(sp)
-; RV32I-NEXT:    sb a3, 90(sp)
-; RV32I-NEXT:    sb a4, 89(sp)
-; RV32I-NEXT:    sb a0, 88(sp)
-; RV32I-NEXT:    sb a5, 87(sp)
-; RV32I-NEXT:    sb a6, 86(sp)
-; RV32I-NEXT:    sb a7, 85(sp)
-; RV32I-NEXT:    sb s10, 84(sp)
-; RV32I-NEXT:    sb ra, 83(sp)
-; RV32I-NEXT:    sb s11, 82(sp)
-; RV32I-NEXT:    sb s9, 81(sp)
-; RV32I-NEXT:    sb s8, 80(sp)
-; RV32I-NEXT:    sb s7, 79(sp)
-; RV32I-NEXT:    sb s6, 78(sp)
-; RV32I-NEXT:    sb s5, 77(sp)
-; RV32I-NEXT:    sb s4, 76(sp)
-; RV32I-NEXT:    sb zero, 59(sp)
-; RV32I-NEXT:    sb zero, 58(sp)
-; RV32I-NEXT:    sb zero, 57(sp)
-; RV32I-NEXT:    sb zero, 56(sp)
-; RV32I-NEXT:    sb zero, 55(sp)
-; RV32I-NEXT:    sb zero, 54(sp)
-; RV32I-NEXT:    sb zero, 53(sp)
-; RV32I-NEXT:    sb zero, 52(sp)
-; RV32I-NEXT:    sb zero, 51(sp)
-; RV32I-NEXT:    sb zero, 50(sp)
-; RV32I-NEXT:    sb zero, 49(sp)
-; RV32I-NEXT:    sb zero, 48(sp)
-; RV32I-NEXT:    sb zero, 47(sp)
-; RV32I-NEXT:    sb zero, 46(sp)
-; RV32I-NEXT:    sb zero, 45(sp)
-; RV32I-NEXT:    sb zero, 44(sp)
-; RV32I-NEXT:    sb zero, 43(sp)
-; RV32I-NEXT:    sb zero, 42(sp)
-; RV32I-NEXT:    sb zero, 41(sp)
-; RV32I-NEXT:    sb zero, 40(sp)
-; RV32I-NEXT:    sb zero, 39(sp)
-; RV32I-NEXT:    sb zero, 38(sp)
-; RV32I-NEXT:    sb zero, 37(sp)
-; RV32I-NEXT:    sb zero, 36(sp)
-; RV32I-NEXT:    sb zero, 35(sp)
-; RV32I-NEXT:    sb zero, 34(sp)
-; RV32I-NEXT:    sb zero, 33(sp)
-; RV32I-NEXT:    sb zero, 32(sp)
-; RV32I-NEXT:    sb zero, 31(sp)
-; RV32I-NEXT:    sb zero, 30(sp)
-; RV32I-NEXT:    sb zero, 29(sp)
-; RV32I-NEXT:    sb zero, 28(sp)
-; RV32I-NEXT:    sb s3, 75(sp)
-; RV32I-NEXT:    sb s2, 74(sp)
-; RV32I-NEXT:    sb s1, 73(sp)
-; RV32I-NEXT:    sb s0, 72(sp)
-; RV32I-NEXT:    sb t6, 71(sp)
-; RV32I-NEXT:    sb t5, 70(sp)
-; RV32I-NEXT:    sb t4, 69(sp)
-; RV32I-NEXT:    sb t3, 68(sp)
-; RV32I-NEXT:    sb t2, 67(sp)
-; RV32I-NEXT:    sb t1, 66(sp)
-; RV32I-NEXT:    lw a0, 4(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    sb a0, 65(sp)
-; RV32I-NEXT:    lw a0, 8(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    sb a0, 64(sp)
-; RV32I-NEXT:    lw a0, 12(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    sb a0, 63(sp)
-; RV32I-NEXT:    lw a0, 16(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    sb a0, 62(sp)
-; RV32I-NEXT:    lw a0, 20(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    sb a0, 61(sp)
-; RV32I-NEXT:    lw a0, 24(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    sb a0, 60(sp)
-; RV32I-NEXT:    slli a0, t0, 24
-; RV32I-NEXT:    srli a0, a0, 27
-; RV32I-NEXT:    addi a4, sp, 60
-; RV32I-NEXT:    sub a4, a4, a0
-; RV32I-NEXT:    lbu a0, 5(a4)
-; RV32I-NEXT:    lbu a1, 4(a4)
-; RV32I-NEXT:    lbu a3, 6(a4)
-; RV32I-NEXT:    lbu a5, 7(a4)
-; RV32I-NEXT:    slli a0, a0, 8
-; RV32I-NEXT:    or a0, a0, a1
-; RV32I-NEXT:    slli a3, a3, 16
-; RV32I-NEXT:    slli a5, a5, 24
-; RV32I-NEXT:    or a3, a5, a3
-; RV32I-NEXT:    or t5, a3, a0
-; RV32I-NEXT:    andi a1, t0, 7
-; RV32I-NEXT:    lbu a0, 1(a4)
-; RV32I-NEXT:    lbu a3, 0(a4)
-; RV32I-NEXT:    lbu a5, 2(a4)
-; RV32I-NEXT:    lbu a6, 3(a4)
-; RV32I-NEXT:    slli a0, a0, 8
-; RV32I-NEXT:    or a0, a0, a3
+; RV32I-NEXT:    lbu a4, 0(a0)
+; RV32I-NEXT:    lbu a5, 2(a0)
+; RV32I-NEXT:    lbu a6, 3(a0)
+; RV32I-NEXT:    slli a3, a3, 8
+; RV32I-NEXT:    or a3, a3, a4
 ; RV32I-NEXT:    slli a5, a5, 16
 ; RV32I-NEXT:    slli a6, a6, 24
-; RV32I-NEXT:    or a3, a6, a5
-; RV32I-NEXT:    or a6, a3, a0
-; RV32I-NEXT:    srli a0, a6, 1
-; RV32I-NEXT:    xori a7, a1, 31
-; RV32I-NEXT:    srl a0, a0, a7
-; RV32I-NEXT:    lbu a3, 13(a4)
-; RV32I-NEXT:    lbu a5, 12(a4)
-; RV32I-NEXT:    lbu t0, 14(a4)
-; RV32I-NEXT:    lbu t1, 15(a4)
-; RV32I-NEXT:    slli a3, a3, 8
-; RV32I-NEXT:    or a3, a3, a5
-; RV32I-NEXT:    slli t0, t0, 16
-; RV32I-NEXT:    slli t1, t1, 24
-; RV32I-NEXT:    or a5, t1, t0
-; RV32I-NEXT:    or t0, a5, a3
-; RV32I-NEXT:    lbu a3, 9(a4)
-; RV32I-NEXT:    lbu a5, 8(a4)
-; RV32I-NEXT:    lbu t1, 10(a4)
-; RV32I-NEXT:    lbu t2, 11(a4)
-; RV32I-NEXT:    slli a3, a3, 8
-; RV32I-NEXT:    or a3, a3, a5
+; RV32I-NEXT:    or a4, a6, a5
+; RV32I-NEXT:    or a3, a4, a3
+; RV32I-NEXT:    lbu a4, 5(a0)
+; RV32I-NEXT:    lbu a5, 4(a0)
+; RV32I-NEXT:    lbu a6, 6(a0)
+; RV32I-NEXT:    lbu a7, 7(a0)
+; RV32I-NEXT:    slli a4, a4, 8
+; RV32I-NEXT:    or a4, a4, a5
+; RV32I-NEXT:    slli a6, a6, 16
+; RV32I-NEXT:    slli a7, a7, 24
+; RV32I-NEXT:    or a5, a7, a6
+; RV32I-NEXT:    or a4, a5, a4
+; RV32I-NEXT:    lbu a5, 9(a0)
+; RV32I-NEXT:    lbu a6, 8(a0)
+; RV32I-NEXT:    lbu a7, 10(a0)
+; RV32I-NEXT:    lbu t0, 11(a0)
+; RV32I-NEXT:    slli a5, a5, 8
+; RV32I-NEXT:    or a5, a5, a6
+; RV32I-NEXT:    slli a7, a7, 16
+; RV32I-NEXT:    slli t0, t0, 24
+; RV32I-NEXT:    or a6, t0, a7
+; RV32I-NEXT:    or a5, a6, a5
+; RV32I-NEXT:    lbu a6, 13(a0)
+; RV32I-NEXT:    lbu a7, 12(a0)
+; RV32I-NEXT:    lbu t0, 14(a0)
+; RV32I-NEXT:    lbu t1, 15(a0)
+; RV32I-NEXT:    slli a6, a6, 8
+; RV32I-NEXT:    or a6, a6, a7
+; RV32I-NEXT:    slli t0, t0, 16
+; RV32I-NEXT:    slli t1, t1, 24
+; RV32I-NEXT:    or a7, t1, t0
+; RV32I-NEXT:    or a6, a7, a6
+; RV32I-NEXT:    lbu a7, 17(a0)
+; RV32I-NEXT:    lbu t0, 16(a0)
+; RV32I-NEXT:    lbu t1, 18(a0)
+; RV32I-NEXT:    lbu t2, 19(a0)
+; RV32I-NEXT:    slli a7, a7, 8
+; RV32I-NEXT:    or a7, a7, t0
 ; RV32I-NEXT:    slli t1, t1, 16
 ; RV32I-NEXT:    slli t2, t2, 24
-; RV32I-NEXT:    or a5, t2, t1
-; RV32I-NEXT:    or t1, a5, a3
-; RV32I-NEXT:    srli a3, t1, 1
-; RV32I-NEXT:    srl a5, a3, a7
-; RV32I-NEXT:    srli t4, t5, 1
-; RV32I-NEXT:    not t2, a1
-; RV32I-NEXT:    lbu a3, 21(a4)
-; RV32I-NEXT:    lbu t3, 20(a4)
-; RV32I-NEXT:    lbu t6, 22(a4)
-; RV32I-NEXT:    lbu s0, 23(a4)
-; RV32I-NEXT:    slli a3, a3, 8
-; RV32I-NEXT:    or a3, a3, t3
-; RV32I-NEXT:    slli t6, t6, 16
-; RV32I-NEXT:    slli s0, s0, 24
-; RV32I-NEXT:    or t3, s0, t6
-; RV32I-NEXT:    or t3, t3, a3
-; RV32I-NEXT:    lbu a3, 17(a4)
-; RV32I-NEXT:    lbu t6, 16(a4)
-; RV32I-NEXT:    lbu s0, 18(a4)
-; RV32I-NEXT:    lbu s1, 19(a4)
-; RV32I-NEXT:    slli a3, a3, 8
-; RV32I-NEXT:    or a3, a3, t6
-; RV32I-NEXT:    slli s0, s0, 16
-; RV32I-NEXT:    slli s1, s1, 24
-; RV32I-NEXT:    or s0, s1, s0
-; RV32I-NEXT:    or s0, s0, a3
-; RV32I-NEXT:    lbu a3, 29(a4)
-; RV32I-NEXT:    lbu t6, 28(a4)
-; RV32I-NEXT:    lbu s1, 30(a4)
-; RV32I-NEXT:    lbu s2, 31(a4)
-; RV32I-NEXT:    slli a3, a3, 8
-; RV32I-NEXT:    or a3, a3, t6
-; RV32I-NEXT:    slli s1, s1, 16
-; RV32I-NEXT:    slli s2, s2, 24
-; RV32I-NEXT:    or t6, s2, s1
-; RV32I-NEXT:    lbu s1, 25(a4)
-; RV32I-NEXT:    lbu s2, 24(a4)
-; RV32I-NEXT:    srl t4, t4, t2
-; RV32I-NEXT:    or t6, t6, a3
-; RV32I-NEXT:    slli s1, s1, 8
-; RV32I-NEXT:    or a3, s1, s2
-; RV32I-NEXT:    lbu s1, 26(a4)
-; RV32I-NEXT:    lbu a4, 27(a4)
-; RV32I-NEXT:    srli s2, s0, 1
-; RV32I-NEXT:    srl s2, s2, a7
-; RV32I-NEXT:    slli s1, s1, 16
-; RV32I-NEXT:    slli a4, a4, 24
-; RV32I-NEXT:    or a4, a4, s1
-; RV32I-NEXT:    srli s1, t0, 1
-; RV32I-NEXT:    srl s1, s1, t2
-; RV32I-NEXT:    or a4, a4, a3
-; RV32I-NEXT:    srli a3, a4, 1
-; RV32I-NEXT:    srl a7, a3, a7
-; RV32I-NEXT:    srli a3, t3, 1
-; RV32I-NEXT:    srl t2, a3, t2
-; RV32I-NEXT:    sll a3, t5, a1
-; RV32I-NEXT:    sll t0, t0, a1
-; RV32I-NEXT:    sll t1, t1, a1
-; RV32I-NEXT:    sll t3, t3, a1
-; RV32I-NEXT:    sll t5, s0, a1
-; RV32I-NEXT:    sll t6, t6, a1
-; RV32I-NEXT:    sll a4, a4, a1
-; RV32I-NEXT:    sll a1, a6, a1
-; RV32I-NEXT:    srli a6, a4, 24
-; RV32I-NEXT:    sb a6, 27(a2)
-; RV32I-NEXT:    srli a6, a4, 16
-; RV32I-NEXT:    sb a6, 26(a2)
-; RV32I-NEXT:    or a6, a4, t2
+; RV32I-NEXT:    or t0, t2, t1
+; RV32I-NEXT:    or t0, t0, a7
+; RV32I-NEXT:    lbu a7, 21(a0)
+; RV32I-NEXT:    lbu t1, 20(a0)
+; RV32I-NEXT:    lbu t2, 22(a0)
+; RV32I-NEXT:    lbu t3, 23(a0)
+; RV32I-NEXT:    slli a7, a7, 8
+; RV32I-NEXT:    or a7, a7, t1
+; RV32I-NEXT:    slli t2, t2, 16
+; RV32I-NEXT:    slli t3, t3, 24
+; RV32I-NEXT:    or t1, t3, t2
+; RV32I-NEXT:    or t1, t1, a7
+; RV32I-NEXT:    lbu a7, 25(a0)
+; RV32I-NEXT:    lbu t2, 24(a0)
+; RV32I-NEXT:    lbu t3, 26(a0)
+; RV32I-NEXT:    lbu t4, 27(a0)
+; RV32I-NEXT:    slli a7, a7, 8
+; RV32I-NEXT:    or a7, a7, t2
+; RV32I-NEXT:    slli t3, t3, 16
+; RV32I-NEXT:    slli t4, t4, 24
+; RV32I-NEXT:    or t2, t4, t3
+; RV32I-NEXT:    or t2, t2, a7
+; RV32I-NEXT:    lbu a7, 29(a0)
+; RV32I-NEXT:    lbu t3, 28(a0)
+; RV32I-NEXT:    lbu t4, 30(a0)
+; RV32I-NEXT:    lbu a0, 31(a0)
+; RV32I-NEXT:    slli a7, a7, 8
+; RV32I-NEXT:    or a7, a7, t3
+; RV32I-NEXT:    slli t4, t4, 16
+; RV32I-NEXT:    slli a0, a0, 24
+; RV32I-NEXT:    or a0, a0, t4
+; RV32I-NEXT:    or a0, a0, a7
+; RV32I-NEXT:    lbu a7, 1(a1)
+; RV32I-NEXT:    lbu t3, 0(a1)
+; RV32I-NEXT:    lbu t4, 2(a1)
+; RV32I-NEXT:    lbu a1, 3(a1)
+; RV32I-NEXT:    slli a7, a7, 8
+; RV32I-NEXT:    or a7, a7, t3
+; RV32I-NEXT:    slli t4, t4, 16
+; RV32I-NEXT:    slli a1, a1, 24
+; RV32I-NEXT:    or a1, a1, t4
+; RV32I-NEXT:    or a7, a1, a7
+; RV32I-NEXT:    sw zero, 28(sp)
+; RV32I-NEXT:    sw zero, 24(sp)
+; RV32I-NEXT:    sw zero, 20(sp)
+; RV32I-NEXT:    sw zero, 16(sp)
+; RV32I-NEXT:    sw zero, 12(sp)
+; RV32I-NEXT:    sw zero, 8(sp)
+; RV32I-NEXT:    sw zero, 4(sp)
+; RV32I-NEXT:    sw zero, 0(sp)
+; RV32I-NEXT:    sw a0, 60(sp)
+; RV32I-NEXT:    sw t2, 56(sp)
+; RV32I-NEXT:    sw t1, 52(sp)
+; RV32I-NEXT:    sw t0, 48(sp)
+; RV32I-NEXT:    sw a6, 44(sp)
+; RV32I-NEXT:    sw a5, 40(sp)
+; RV32I-NEXT:    sw a4, 36(sp)
+; RV32I-NEXT:    sw a3, 32(sp)
+; RV32I-NEXT:    srli a0, a7, 3
+; RV32I-NEXT:    andi a0, a0, 28
+; RV32I-NEXT:    addi a1, sp, 32
+; RV32I-NEXT:    sub a4, a1, a0
+; RV32I-NEXT:    lw a3, 4(a4)
+; RV32I-NEXT:    lw a5, 0(a4)
+; RV32I-NEXT:    sll a0, a3, a7
+; RV32I-NEXT:    andi a1, a7, 31
+; RV32I-NEXT:    xori a6, a1, 31
+; RV32I-NEXT:    srli a1, a5, 1
+; RV32I-NEXT:    lw t0, 12(a4)
+; RV32I-NEXT:    lw t1, 8(a4)
+; RV32I-NEXT:    srl a1, a1, a6
+; RV32I-NEXT:    or a0, a0, a1
+; RV32I-NEXT:    sll a1, t0, a7
+; RV32I-NEXT:    srli t2, t1, 1
+; RV32I-NEXT:    srl t2, t2, a6
+; RV32I-NEXT:    or a1, a1, t2
+; RV32I-NEXT:    sll t1, t1, a7
+; RV32I-NEXT:    srli a3, a3, 1
+; RV32I-NEXT:    lw t2, 20(a4)
+; RV32I-NEXT:    lw t3, 16(a4)
+; RV32I-NEXT:    srl a3, a3, a6
+; RV32I-NEXT:    or a3, t1, a3
+; RV32I-NEXT:    sll t1, t2, a7
+; RV32I-NEXT:    srli t4, t3, 1
+; RV32I-NEXT:    srl t4, t4, a6
+; RV32I-NEXT:    or t1, t1, t4
+; RV32I-NEXT:    sll t3, t3, a7
+; RV32I-NEXT:    srli t0, t0, 1
+; RV32I-NEXT:    lw t4, 28(a4)
+; RV32I-NEXT:    lw a4, 24(a4)
+; RV32I-NEXT:    srl t0, t0, a6
+; RV32I-NEXT:    or t0, t3, t0
+; RV32I-NEXT:    sll t3, t4, a7
+; RV32I-NEXT:    srli t4, a4, 1
+; RV32I-NEXT:    srl t4, t4, a6
+; RV32I-NEXT:    or t3, t3, t4
+; RV32I-NEXT:    sll a4, a4, a7
+; RV32I-NEXT:    srli t2, t2, 1
+; RV32I-NEXT:    srl a6, t2, a6
+; RV32I-NEXT:    or a4, a4, a6
+; RV32I-NEXT:    sll a5, a5, a7
+; RV32I-NEXT:    sb a5, 0(a2)
+; RV32I-NEXT:    srli a6, a5, 24
+; RV32I-NEXT:    sb a6, 3(a2)
+; RV32I-NEXT:    srli a6, a5, 16
+; RV32I-NEXT:    sb a6, 2(a2)
+; RV32I-NEXT:    srli a5, a5, 8
+; RV32I-NEXT:    sb a5, 1(a2)
+; RV32I-NEXT:    sb a4, 24(a2)
+; RV32I-NEXT:    sb t3, 28(a2)
+; RV32I-NEXT:    sb t0, 16(a2)
+; RV32I-NEXT:    sb t1, 20(a2)
+; RV32I-NEXT:    sb a3, 8(a2)
+; RV32I-NEXT:    sb a1, 12(a2)
+; RV32I-NEXT:    sb a0, 4(a2)
+; RV32I-NEXT:    srli a5, a4, 24
+; RV32I-NEXT:    sb a5, 27(a2)
+; RV32I-NEXT:    srli a5, a4, 16
+; RV32I-NEXT:    sb a5, 26(a2)
 ; RV32I-NEXT:    srli a4, a4, 8
 ; RV32I-NEXT:    sb a4, 25(a2)
-; RV32I-NEXT:    srli a4, t6, 24
+; RV32I-NEXT:    srli a4, t3, 24
 ; RV32I-NEXT:    sb a4, 31(a2)
-; RV32I-NEXT:    srli a4, t6, 16
+; RV32I-NEXT:    srli a4, t3, 16
 ; RV32I-NEXT:    sb a4, 30(a2)
-; RV32I-NEXT:    or a4, t6, a7
-; RV32I-NEXT:    srli a7, t6, 8
-; RV32I-NEXT:    sb a7, 29(a2)
-; RV32I-NEXT:    srli a7, t5, 24
-; RV32I-NEXT:    sb a7, 19(a2)
-; RV32I-NEXT:    srli a7, t5, 16
-; RV32I-NEXT:    sb a7, 18(a2)
-; RV32I-NEXT:    or a7, t5, s1
-; RV32I-NEXT:    srli t2, t5, 8
-; RV32I-NEXT:    sb t2, 17(a2)
-; RV32I-NEXT:    srli t2, t3, 24
-; RV32I-NEXT:    sb t2, 23(a2)
-; RV32I-NEXT:    srli t2, t3, 16
-; RV32I-NEXT:    sb t2, 22(a2)
-; RV32I-NEXT:    or t2, t3, s2
-; RV32I-NEXT:    srli t3, t3, 8
-; RV32I-NEXT:    sb t3, 21(a2)
-; RV32I-NEXT:    srli t3, t1, 24
-; RV32I-NEXT:    sb t3, 11(a2)
-; RV32I-NEXT:    srli t3, t1, 16
-; RV32I-NEXT:    sb t3, 10(a2)
-; RV32I-NEXT:    or t3, t1, t4
-; RV32I-NEXT:    srli t1, t1, 8
-; RV32I-NEXT:    sb t1, 9(a2)
-; RV32I-NEXT:    srli t1, t0, 24
-; RV32I-NEXT:    sb t1, 15(a2)
-; RV32I-NEXT:    srli t1, t0, 16
-; RV32I-NEXT:    sb t1, 14(a2)
-; RV32I-NEXT:    or a5, t0, a5
-; RV32I-NEXT:    srli t0, t0, 8
-; RV32I-NEXT:    sb t0, 13(a2)
-; RV32I-NEXT:    srli t0, a1, 24
-; RV32I-NEXT:    sb t0, 3(a2)
-; RV32I-NEXT:    srli t0, a1, 16
-; RV32I-NEXT:    sb t0, 2(a2)
-; RV32I-NEXT:    sb a1, 0(a2)
+; RV32I-NEXT:    srli a4, t3, 8
+; RV32I-NEXT:    sb a4, 29(a2)
+; RV32I-NEXT:    srli a4, t0, 24
+; RV32I-NEXT:    sb a4, 19(a2)
+; RV32I-NEXT:    srli a4, t0, 16
+; RV32I-NEXT:    sb a4, 18(a2)
+; RV32I-NEXT:    srli a4, t0, 8
+; RV32I-NEXT:    sb a4, 17(a2)
+; RV32I-NEXT:    srli a4, t1, 24
+; RV32I-NEXT:    sb a4, 23(a2)
+; RV32I-NEXT:    srli a4, t1, 16
+; RV32I-NEXT:    sb a4, 22(a2)
+; RV32I-NEXT:    srli a4, t1, 8
+; RV32I-NEXT:    sb a4, 21(a2)
+; RV32I-NEXT:    srli a4, a3, 24
+; RV32I-NEXT:    sb a4, 11(a2)
+; RV32I-NEXT:    srli a4, a3, 16
+; RV32I-NEXT:    sb a4, 10(a2)
+; RV32I-NEXT:    srli a3, a3, 8
+; RV32I-NEXT:    sb a3, 9(a2)
+; RV32I-NEXT:    srli a3, a1, 24
+; RV32I-NEXT:    sb a3, 15(a2)
+; RV32I-NEXT:    srli a3, a1, 16
+; RV32I-NEXT:    sb a3, 14(a2)
 ; RV32I-NEXT:    srli a1, a1, 8
-; RV32I-NEXT:    sb a1, 1(a2)
-; RV32I-NEXT:    srli a1, a3, 24
+; RV32I-NEXT:    sb a1, 13(a2)
+; RV32I-NEXT:    srli a1, a0, 24
 ; RV32I-NEXT:    sb a1, 7(a2)
-; RV32I-NEXT:    srli a1, a3, 16
+; RV32I-NEXT:    srli a1, a0, 16
 ; RV32I-NEXT:    sb a1, 6(a2)
-; RV32I-NEXT:    or a0, a3, a0
-; RV32I-NEXT:    srli a3, a3, 8
-; RV32I-NEXT:    sb a3, 5(a2)
-; RV32I-NEXT:    sb a6, 24(a2)
-; RV32I-NEXT:    sb a4, 28(a2)
-; RV32I-NEXT:    sb a7, 16(a2)
-; RV32I-NEXT:    sb t2, 20(a2)
-; RV32I-NEXT:    sb t3, 8(a2)
-; RV32I-NEXT:    sb a5, 12(a2)
-; RV32I-NEXT:    sb a0, 4(a2)
-; RV32I-NEXT:    lw ra, 140(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s0, 136(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s1, 132(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s2, 128(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s3, 124(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s4, 120(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s5, 116(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s6, 112(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s7, 108(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s8, 104(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s9, 100(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s10, 96(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s11, 92(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    addi sp, sp, 144
+; RV32I-NEXT:    srli a0, a0, 8
+; RV32I-NEXT:    sb a0, 5(a2)
+; RV32I-NEXT:    addi sp, sp, 64
 ; RV32I-NEXT:    ret
   %src = load i256, ptr %src.ptr, align 1
   %bitOff = load i256, ptr %bitOff.ptr, align 1
@@ -2762,200 +2147,43 @@ define void @shl_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 define void @ashr_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; RV64I-LABEL: ashr_32bytes:
 ; RV64I:       # %bb.0:
-; RV64I-NEXT:    addi sp, sp, -224
-; RV64I-NEXT:    sd ra, 216(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    sd s0, 208(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    sd s1, 200(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    sd s2, 192(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    sd s3, 184(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    sd s4, 176(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    sd s5, 168(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    sd s6, 160(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    sd s7, 152(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    sd s8, 144(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    sd s9, 136(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    sd s10, 128(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    sd s11, 120(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    lbu t1, 31(a0)
-; RV64I-NEXT:    lbu a3, 0(a0)
-; RV64I-NEXT:    sd a3, 48(sp) # 8-byte Folded Spill
+; RV64I-NEXT:    addi sp, sp, -64
 ; RV64I-NEXT:    lbu a3, 1(a0)
-; RV64I-NEXT:    sd a3, 40(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    lbu a3, 2(a0)
-; RV64I-NEXT:    sd a3, 32(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    lbu a3, 3(a0)
-; RV64I-NEXT:    sd a3, 24(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    lbu a3, 4(a0)
-; RV64I-NEXT:    sd a3, 16(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    lbu a3, 5(a0)
-; RV64I-NEXT:    sd a3, 8(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    lbu t3, 6(a0)
-; RV64I-NEXT:    lbu t4, 7(a0)
-; RV64I-NEXT:    lbu t5, 8(a0)
-; RV64I-NEXT:    lbu t6, 9(a0)
-; RV64I-NEXT:    lbu s0, 10(a0)
-; RV64I-NEXT:    lbu s1, 11(a0)
-; RV64I-NEXT:    lbu s2, 12(a0)
-; RV64I-NEXT:    lbu s3, 13(a0)
-; RV64I-NEXT:    lbu s4, 14(a0)
-; RV64I-NEXT:    lbu s5, 15(a0)
-; RV64I-NEXT:    lbu s6, 16(a0)
-; RV64I-NEXT:    lbu s7, 17(a0)
-; RV64I-NEXT:    lbu s8, 18(a0)
-; RV64I-NEXT:    lbu s9, 19(a0)
-; RV64I-NEXT:    lbu a3, 1(a1)
-; RV64I-NEXT:    lbu s10, 0(a1)
-; RV64I-NEXT:    lbu s11, 2(a1)
-; RV64I-NEXT:    lbu ra, 3(a1)
-; RV64I-NEXT:    slli a3, a3, 8
-; RV64I-NEXT:    or a3, a3, s10
-; RV64I-NEXT:    slli s11, s11, 16
-; RV64I-NEXT:    slli ra, ra, 24
-; RV64I-NEXT:    lbu s10, 5(a1)
-; RV64I-NEXT:    or s11, ra, s11
-; RV64I-NEXT:    or a3, s11, a3
-; RV64I-NEXT:    lbu s11, 4(a1)
-; RV64I-NEXT:    slli s10, s10, 8
-; RV64I-NEXT:    lbu ra, 6(a1)
-; RV64I-NEXT:    lbu a1, 7(a1)
-; RV64I-NEXT:    or s10, s10, s11
-; RV64I-NEXT:    lbu s11, 20(a0)
-; RV64I-NEXT:    slli ra, ra, 16
-; RV64I-NEXT:    slli a1, a1, 24
-; RV64I-NEXT:    or a1, a1, ra
-; RV64I-NEXT:    lbu ra, 21(a0)
-; RV64I-NEXT:    or a1, a1, s10
-; RV64I-NEXT:    lbu s10, 22(a0)
-; RV64I-NEXT:    slli a1, a1, 32
-; RV64I-NEXT:    or t2, a1, a3
-; RV64I-NEXT:    lbu t0, 23(a0)
-; RV64I-NEXT:    lbu a7, 24(a0)
-; RV64I-NEXT:    lbu a6, 25(a0)
-; RV64I-NEXT:    lbu a5, 26(a0)
-; RV64I-NEXT:    lbu a1, 30(a0)
-; RV64I-NEXT:    lbu a3, 29(a0)
-; RV64I-NEXT:    lbu a4, 28(a0)
-; RV64I-NEXT:    lbu a0, 27(a0)
-; RV64I-NEXT:    sb a1, 86(sp)
-; RV64I-NEXT:    sb a3, 85(sp)
-; RV64I-NEXT:    sb a4, 84(sp)
-; RV64I-NEXT:    sb a0, 83(sp)
-; RV64I-NEXT:    sb a5, 82(sp)
-; RV64I-NEXT:    sb a6, 81(sp)
-; RV64I-NEXT:    sb a7, 80(sp)
-; RV64I-NEXT:    sb t0, 79(sp)
-; RV64I-NEXT:    sb s10, 78(sp)
-; RV64I-NEXT:    sb ra, 77(sp)
-; RV64I-NEXT:    sb s11, 76(sp)
-; RV64I-NEXT:    sb s9, 75(sp)
-; RV64I-NEXT:    sb s8, 74(sp)
-; RV64I-NEXT:    sb s7, 73(sp)
-; RV64I-NEXT:    sb s6, 72(sp)
-; RV64I-NEXT:    sb s5, 71(sp)
-; RV64I-NEXT:    sb s4, 70(sp)
-; RV64I-NEXT:    sb s3, 69(sp)
-; RV64I-NEXT:    sb s2, 68(sp)
-; RV64I-NEXT:    sb s1, 67(sp)
-; RV64I-NEXT:    sb s0, 66(sp)
-; RV64I-NEXT:    sb t6, 65(sp)
-; RV64I-NEXT:    sb t5, 64(sp)
-; RV64I-NEXT:    sb t1, 87(sp)
-; RV64I-NEXT:    slli t1, t1, 56
-; RV64I-NEXT:    sb t4, 63(sp)
-; RV64I-NEXT:    sb t3, 62(sp)
-; RV64I-NEXT:    ld a0, 8(sp) # 8-byte Folded Reload
-; RV64I-NEXT:    sb a0, 61(sp)
-; RV64I-NEXT:    ld a0, 16(sp) # 8-byte Folded Reload
-; RV64I-NEXT:    sb a0, 60(sp)
-; RV64I-NEXT:    ld a0, 24(sp) # 8-byte Folded Reload
-; RV64I-NEXT:    sb a0, 59(sp)
-; RV64I-NEXT:    ld a0, 32(sp) # 8-byte Folded Reload
-; RV64I-NEXT:    sb a0, 58(sp)
-; RV64I-NEXT:    ld a0, 40(sp) # 8-byte Folded Reload
-; RV64I-NEXT:    sb a0, 57(sp)
-; RV64I-NEXT:    ld a0, 48(sp) # 8-byte Folded Reload
-; RV64I-NEXT:    sb a0, 56(sp)
-; RV64I-NEXT:    srai a0, t1, 63
-; RV64I-NEXT:    sb a0, 112(sp)
-; RV64I-NEXT:    sb a0, 104(sp)
-; RV64I-NEXT:    sb a0, 96(sp)
-; RV64I-NEXT:    sb a0, 88(sp)
-; RV64I-NEXT:    srli a1, a0, 56
-; RV64I-NEXT:    sb a1, 119(sp)
-; RV64I-NEXT:    srli a3, a0, 48
-; RV64I-NEXT:    sb a3, 118(sp)
-; RV64I-NEXT:    srli a4, a0, 40
-; RV64I-NEXT:    sb a4, 117(sp)
-; RV64I-NEXT:    srli a5, a0, 32
-; RV64I-NEXT:    sb a5, 116(sp)
-; RV64I-NEXT:    srli a6, a0, 24
-; RV64I-NEXT:    sb a6, 115(sp)
-; RV64I-NEXT:    srli a7, a0, 16
-; RV64I-NEXT:    sb a7, 114(sp)
-; RV64I-NEXT:    srli a0, a0, 8
-; RV64I-NEXT:    sb a0, 113(sp)
-; RV64I-NEXT:    sb a1, 111(sp)
-; RV64I-NEXT:    sb a3, 110(sp)
-; RV64I-NEXT:    sb a4, 109(sp)
-; RV64I-NEXT:    sb a5, 108(sp)
-; RV64I-NEXT:    sb a6, 107(sp)
-; RV64I-NEXT:    sb a7, 106(sp)
-; RV64I-NEXT:    sb a0, 105(sp)
-; RV64I-NEXT:    sb a1, 103(sp)
-; RV64I-NEXT:    sb a3, 102(sp)
-; RV64I-NEXT:    sb a4, 101(sp)
-; RV64I-NEXT:    sb a5, 100(sp)
-; RV64I-NEXT:    sb a6, 99(sp)
-; RV64I-NEXT:    sb a7, 98(sp)
-; RV64I-NEXT:    sb a0, 97(sp)
-; RV64I-NEXT:    sb a1, 95(sp)
-; RV64I-NEXT:    sb a3, 94(sp)
-; RV64I-NEXT:    sb a4, 93(sp)
-; RV64I-NEXT:    sb a5, 92(sp)
-; RV64I-NEXT:    sb a6, 91(sp)
-; RV64I-NEXT:    sb a7, 90(sp)
-; RV64I-NEXT:    sb a0, 89(sp)
-; RV64I-NEXT:    slli a0, t2, 56
-; RV64I-NEXT:    srli a0, a0, 59
-; RV64I-NEXT:    addi a1, sp, 56
-; RV64I-NEXT:    add a1, a1, a0
-; RV64I-NEXT:    lbu a0, 9(a1)
-; RV64I-NEXT:    lbu a3, 8(a1)
-; RV64I-NEXT:    lbu a4, 10(a1)
-; RV64I-NEXT:    lbu a5, 11(a1)
-; RV64I-NEXT:    slli a0, a0, 8
-; RV64I-NEXT:    or a0, a0, a3
-; RV64I-NEXT:    slli a4, a4, 16
-; RV64I-NEXT:    slli a5, a5, 24
-; RV64I-NEXT:    or a4, a5, a4
-; RV64I-NEXT:    or a0, a4, a0
-; RV64I-NEXT:    lbu a3, 13(a1)
-; RV64I-NEXT:    lbu a4, 12(a1)
-; RV64I-NEXT:    lbu a5, 14(a1)
-; RV64I-NEXT:    lbu a6, 15(a1)
+; RV64I-NEXT:    lbu a4, 0(a0)
+; RV64I-NEXT:    lbu a5, 2(a0)
+; RV64I-NEXT:    lbu a6, 3(a0)
 ; RV64I-NEXT:    slli a3, a3, 8
 ; RV64I-NEXT:    or a3, a3, a4
 ; RV64I-NEXT:    slli a5, a5, 16
 ; RV64I-NEXT:    slli a6, a6, 24
 ; RV64I-NEXT:    or a4, a6, a5
 ; RV64I-NEXT:    or a3, a4, a3
-; RV64I-NEXT:    slli a3, a3, 32
-; RV64I-NEXT:    or a4, a3, a0
-; RV64I-NEXT:    andi a3, t2, 7
-; RV64I-NEXT:    lbu a0, 17(a1)
-; RV64I-NEXT:    lbu a5, 16(a1)
-; RV64I-NEXT:    lbu a6, 18(a1)
-; RV64I-NEXT:    lbu a7, 19(a1)
-; RV64I-NEXT:    slli a0, a0, 8
-; RV64I-NEXT:    or a0, a0, a5
+; RV64I-NEXT:    lbu a4, 5(a0)
+; RV64I-NEXT:    lbu a5, 4(a0)
+; RV64I-NEXT:    lbu a6, 6(a0)
+; RV64I-NEXT:    lbu a7, 7(a0)
+; RV64I-NEXT:    slli a4, a4, 8
+; RV64I-NEXT:    or a4, a4, a5
 ; RV64I-NEXT:    slli a6, a6, 16
 ; RV64I-NEXT:    slli a7, a7, 24
 ; RV64I-NEXT:    or a5, a7, a6
-; RV64I-NEXT:    or a0, a5, a0
-; RV64I-NEXT:    lbu a5, 21(a1)
-; RV64I-NEXT:    lbu a6, 20(a1)
-; RV64I-NEXT:    lbu a7, 22(a1)
-; RV64I-NEXT:    lbu t0, 23(a1)
+; RV64I-NEXT:    or a4, a5, a4
+; RV64I-NEXT:    slli a4, a4, 32
+; RV64I-NEXT:    or a3, a4, a3
+; RV64I-NEXT:    lbu a4, 9(a0)
+; RV64I-NEXT:    lbu a5, 8(a0)
+; RV64I-NEXT:    lbu a6, 10(a0)
+; RV64I-NEXT:    lbu a7, 11(a0)
+; RV64I-NEXT:    slli a4, a4, 8
+; RV64I-NEXT:    or a4, a4, a5
+; RV64I-NEXT:    slli a6, a6, 16
+; RV64I-NEXT:    slli a7, a7, 24
+; RV64I-NEXT:    or a5, a7, a6
+; RV64I-NEXT:    or a4, a5, a4
+; RV64I-NEXT:    lbu a5, 13(a0)
+; RV64I-NEXT:    lbu a6, 12(a0)
+; RV64I-NEXT:    lbu a7, 14(a0)
+; RV64I-NEXT:    lbu t0, 15(a0)
 ; RV64I-NEXT:    slli a5, a5, 8
 ; RV64I-NEXT:    or a5, a5, a6
 ; RV64I-NEXT:    slli a7, a7, 16
@@ -2963,467 +2191,378 @@ define void @ashr_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; RV64I-NEXT:    or a6, t0, a7
 ; RV64I-NEXT:    or a5, a6, a5
 ; RV64I-NEXT:    slli a5, a5, 32
-; RV64I-NEXT:    or a5, a5, a0
-; RV64I-NEXT:    slli a0, a5, 1
-; RV64I-NEXT:    not a6, a3
-; RV64I-NEXT:    sll a0, a0, a6
-; RV64I-NEXT:    lbu a6, 1(a1)
-; RV64I-NEXT:    lbu a7, 0(a1)
-; RV64I-NEXT:    lbu t0, 2(a1)
-; RV64I-NEXT:    lbu t1, 3(a1)
+; RV64I-NEXT:    or a4, a5, a4
+; RV64I-NEXT:    lbu a5, 17(a0)
+; RV64I-NEXT:    lbu a6, 16(a0)
+; RV64I-NEXT:    lbu a7, 18(a0)
+; RV64I-NEXT:    lbu t0, 19(a0)
+; RV64I-NEXT:    slli a5, a5, 8
+; RV64I-NEXT:    or a5, a5, a6
+; RV64I-NEXT:    slli a7, a7, 16
+; RV64I-NEXT:    slli t0, t0, 24
+; RV64I-NEXT:    or a6, t0, a7
+; RV64I-NEXT:    or a5, a6, a5
+; RV64I-NEXT:    lbu a6, 21(a0)
+; RV64I-NEXT:    lbu a7, 20(a0)
+; RV64I-NEXT:    lbu t0, 22(a0)
+; RV64I-NEXT:    lbu t1, 23(a0)
 ; RV64I-NEXT:    slli a6, a6, 8
 ; RV64I-NEXT:    or a6, a6, a7
 ; RV64I-NEXT:    slli t0, t0, 16
 ; RV64I-NEXT:    slli t1, t1, 24
 ; RV64I-NEXT:    or a7, t1, t0
 ; RV64I-NEXT:    or a6, a7, a6
-; RV64I-NEXT:    lbu a7, 5(a1)
-; RV64I-NEXT:    lbu t0, 4(a1)
-; RV64I-NEXT:    lbu t1, 6(a1)
-; RV64I-NEXT:    lbu t2, 7(a1)
+; RV64I-NEXT:    slli a6, a6, 32
+; RV64I-NEXT:    or a5, a6, a5
+; RV64I-NEXT:    lbu a6, 25(a0)
+; RV64I-NEXT:    lbu a7, 24(a0)
+; RV64I-NEXT:    lbu t0, 26(a0)
+; RV64I-NEXT:    lbu t1, 27(a0)
+; RV64I-NEXT:    slli a6, a6, 8
+; RV64I-NEXT:    or a6, a6, a7
+; RV64I-NEXT:    slli t0, t0, 16
+; RV64I-NEXT:    slli t1, t1, 24
+; RV64I-NEXT:    or a7, t1, t0
+; RV64I-NEXT:    or a6, a7, a6
+; RV64I-NEXT:    lbu a7, 29(a0)
+; RV64I-NEXT:    lbu t0, 28(a0)
+; RV64I-NEXT:    lbu t1, 30(a0)
+; RV64I-NEXT:    lbu a0, 31(a0)
 ; RV64I-NEXT:    slli a7, a7, 8
 ; RV64I-NEXT:    or a7, a7, t0
 ; RV64I-NEXT:    slli t1, t1, 16
-; RV64I-NEXT:    slli t2, t2, 24
-; RV64I-NEXT:    or t0, t2, t1
-; RV64I-NEXT:    or a7, t0, a7
-; RV64I-NEXT:    slli a7, a7, 32
+; RV64I-NEXT:    slli a0, a0, 24
+; RV64I-NEXT:    or a0, a0, t1
+; RV64I-NEXT:    or a0, a0, a7
+; RV64I-NEXT:    slli a7, a0, 32
 ; RV64I-NEXT:    or a6, a7, a6
-; RV64I-NEXT:    lbu a7, 25(a1)
-; RV64I-NEXT:    lbu t0, 24(a1)
-; RV64I-NEXT:    lbu t1, 26(a1)
-; RV64I-NEXT:    lbu t2, 27(a1)
+; RV64I-NEXT:    lbu a7, 1(a1)
+; RV64I-NEXT:    lbu t0, 0(a1)
+; RV64I-NEXT:    lbu t1, 2(a1)
+; RV64I-NEXT:    lbu t2, 3(a1)
 ; RV64I-NEXT:    slli a7, a7, 8
 ; RV64I-NEXT:    or a7, a7, t0
 ; RV64I-NEXT:    slli t1, t1, 16
 ; RV64I-NEXT:    slli t2, t2, 24
 ; RV64I-NEXT:    or t0, t2, t1
 ; RV64I-NEXT:    or a7, t0, a7
-; RV64I-NEXT:    lbu t0, 29(a1)
-; RV64I-NEXT:    lbu t1, 28(a1)
-; RV64I-NEXT:    lbu t2, 30(a1)
-; RV64I-NEXT:    lbu a1, 31(a1)
+; RV64I-NEXT:    lbu t0, 5(a1)
+; RV64I-NEXT:    lbu t1, 4(a1)
+; RV64I-NEXT:    lbu t2, 6(a1)
+; RV64I-NEXT:    lbu a1, 7(a1)
 ; RV64I-NEXT:    slli t0, t0, 8
 ; RV64I-NEXT:    or t0, t0, t1
 ; RV64I-NEXT:    slli t2, t2, 16
 ; RV64I-NEXT:    slli a1, a1, 24
 ; RV64I-NEXT:    or a1, a1, t2
-; RV64I-NEXT:    slli t1, a4, 1
 ; RV64I-NEXT:    or a1, a1, t0
-; RV64I-NEXT:    xori t0, a3, 63
-; RV64I-NEXT:    sll t1, t1, t0
 ; RV64I-NEXT:    slli a1, a1, 32
-; RV64I-NEXT:    or a7, a1, a7
-; RV64I-NEXT:    slli a1, a7, 1
-; RV64I-NEXT:    sll t0, a1, t0
-; RV64I-NEXT:    srl a1, a4, a3
-; RV64I-NEXT:    srl a4, a6, a3
-; RV64I-NEXT:    srl a5, a5, a3
-; RV64I-NEXT:    sra a3, a7, a3
-; RV64I-NEXT:    srli a6, a5, 48
-; RV64I-NEXT:    sb a6, 22(a2)
-; RV64I-NEXT:    srli a6, a5, 40
-; RV64I-NEXT:    sb a6, 21(a2)
-; RV64I-NEXT:    srli a6, a5, 32
-; RV64I-NEXT:    sb a6, 20(a2)
-; RV64I-NEXT:    srli a6, a5, 24
-; RV64I-NEXT:    sb a6, 19(a2)
-; RV64I-NEXT:    srli a6, a5, 16
-; RV64I-NEXT:    sb a6, 18(a2)
-; RV64I-NEXT:    or a6, a5, t0
+; RV64I-NEXT:    or a1, a1, a7
+; RV64I-NEXT:    sraiw a0, a0, 31
+; RV64I-NEXT:    sd a0, 56(sp)
+; RV64I-NEXT:    sd a0, 48(sp)
+; RV64I-NEXT:    sd a0, 40(sp)
+; RV64I-NEXT:    sd a0, 32(sp)
+; RV64I-NEXT:    sd a6, 24(sp)
+; RV64I-NEXT:    sd a5, 16(sp)
+; RV64I-NEXT:    sd a4, 8(sp)
+; RV64I-NEXT:    sd a3, 0(sp)
+; RV64I-NEXT:    srli a0, a1, 3
+; RV64I-NEXT:    andi a0, a0, 24
+; RV64I-NEXT:    mv a3, sp
+; RV64I-NEXT:    add a3, a3, a0
+; RV64I-NEXT:    ld a4, 8(a3)
+; RV64I-NEXT:    srl a0, a4, a1
+; RV64I-NEXT:    ld a5, 16(a3)
+; RV64I-NEXT:    andi a6, a1, 63
+; RV64I-NEXT:    xori a6, a6, 63
+; RV64I-NEXT:    ld a7, 0(a3)
+; RV64I-NEXT:    slli t0, a5, 1
+; RV64I-NEXT:    sll t0, t0, a6
+; RV64I-NEXT:    or a0, a0, t0
+; RV64I-NEXT:    srl a7, a7, a1
+; RV64I-NEXT:    slli a4, a4, 1
+; RV64I-NEXT:    ld a3, 24(a3)
+; RV64I-NEXT:    sll a4, a4, a6
+; RV64I-NEXT:    or a4, a7, a4
+; RV64I-NEXT:    srl a5, a5, a1
+; RV64I-NEXT:    slli a7, a3, 1
+; RV64I-NEXT:    sll a6, a7, a6
+; RV64I-NEXT:    or a5, a5, a6
+; RV64I-NEXT:    sra a1, a3, a1
+; RV64I-NEXT:    sb a1, 24(a2)
+; RV64I-NEXT:    srli a3, a1, 56
+; RV64I-NEXT:    sb a3, 31(a2)
+; RV64I-NEXT:    srli a3, a1, 48
+; RV64I-NEXT:    sb a3, 30(a2)
+; RV64I-NEXT:    srli a3, a1, 40
+; RV64I-NEXT:    sb a3, 29(a2)
+; RV64I-NEXT:    srli a3, a1, 32
+; RV64I-NEXT:    sb a3, 28(a2)
+; RV64I-NEXT:    srli a3, a1, 24
+; RV64I-NEXT:    sb a3, 27(a2)
+; RV64I-NEXT:    srli a3, a1, 16
+; RV64I-NEXT:    sb a3, 26(a2)
+; RV64I-NEXT:    srli a1, a1, 8
+; RV64I-NEXT:    sb a1, 25(a2)
 ; RV64I-NEXT:    sb a5, 16(a2)
+; RV64I-NEXT:    sb a4, 0(a2)
+; RV64I-NEXT:    sb a0, 8(a2)
+; RV64I-NEXT:    srli a1, a5, 56
+; RV64I-NEXT:    sb a1, 23(a2)
+; RV64I-NEXT:    srli a1, a5, 48
+; RV64I-NEXT:    sb a1, 22(a2)
+; RV64I-NEXT:    srli a1, a5, 40
+; RV64I-NEXT:    sb a1, 21(a2)
+; RV64I-NEXT:    srli a1, a5, 32
+; RV64I-NEXT:    sb a1, 20(a2)
+; RV64I-NEXT:    srli a1, a5, 24
+; RV64I-NEXT:    sb a1, 19(a2)
+; RV64I-NEXT:    srli a1, a5, 16
+; RV64I-NEXT:    sb a1, 18(a2)
 ; RV64I-NEXT:    srli a5, a5, 8
 ; RV64I-NEXT:    sb a5, 17(a2)
-; RV64I-NEXT:    srli a5, a3, 56
-; RV64I-NEXT:    sb a5, 31(a2)
-; RV64I-NEXT:    srli a5, a3, 48
-; RV64I-NEXT:    sb a5, 30(a2)
-; RV64I-NEXT:    srli a5, a3, 40
-; RV64I-NEXT:    sb a5, 29(a2)
-; RV64I-NEXT:    srli a5, a3, 32
-; RV64I-NEXT:    sb a5, 28(a2)
-; RV64I-NEXT:    srli a5, a3, 24
-; RV64I-NEXT:    sb a5, 27(a2)
-; RV64I-NEXT:    srli a5, a3, 16
-; RV64I-NEXT:    sb a5, 26(a2)
-; RV64I-NEXT:    sb a3, 24(a2)
-; RV64I-NEXT:    srli a3, a3, 8
-; RV64I-NEXT:    sb a3, 25(a2)
-; RV64I-NEXT:    srli a3, a4, 48
-; RV64I-NEXT:    sb a3, 6(a2)
-; RV64I-NEXT:    srli a3, a4, 40
-; RV64I-NEXT:    sb a3, 5(a2)
-; RV64I-NEXT:    srli a3, a4, 32
-; RV64I-NEXT:    sb a3, 4(a2)
-; RV64I-NEXT:    srli a3, a4, 24
-; RV64I-NEXT:    sb a3, 3(a2)
-; RV64I-NEXT:    srli a3, a4, 16
-; RV64I-NEXT:    sb a3, 2(a2)
-; RV64I-NEXT:    or a3, a4, t1
-; RV64I-NEXT:    sb a4, 0(a2)
+; RV64I-NEXT:    srli a1, a4, 56
+; RV64I-NEXT:    sb a1, 7(a2)
+; RV64I-NEXT:    srli a1, a4, 48
+; RV64I-NEXT:    sb a1, 6(a2)
+; RV64I-NEXT:    srli a1, a4, 40
+; RV64I-NEXT:    sb a1, 5(a2)
+; RV64I-NEXT:    srli a1, a4, 32
+; RV64I-NEXT:    sb a1, 4(a2)
+; RV64I-NEXT:    srli a1, a4, 24
+; RV64I-NEXT:    sb a1, 3(a2)
+; RV64I-NEXT:    srli a1, a4, 16
+; RV64I-NEXT:    sb a1, 2(a2)
 ; RV64I-NEXT:    srli a4, a4, 8
 ; RV64I-NEXT:    sb a4, 1(a2)
-; RV64I-NEXT:    srli a4, a1, 48
-; RV64I-NEXT:    sb a4, 14(a2)
-; RV64I-NEXT:    srli a4, a1, 40
-; RV64I-NEXT:    sb a4, 13(a2)
-; RV64I-NEXT:    srli a4, a1, 32
-; RV64I-NEXT:    sb a4, 12(a2)
-; RV64I-NEXT:    srli a4, a1, 24
-; RV64I-NEXT:    sb a4, 11(a2)
-; RV64I-NEXT:    srli a4, a1, 16
-; RV64I-NEXT:    sb a4, 10(a2)
-; RV64I-NEXT:    or a0, a1, a0
-; RV64I-NEXT:    sb a1, 8(a2)
-; RV64I-NEXT:    srli a1, a1, 8
-; RV64I-NEXT:    sb a1, 9(a2)
-; RV64I-NEXT:    srli a1, a6, 56
-; RV64I-NEXT:    sb a1, 23(a2)
-; RV64I-NEXT:    srli a3, a3, 56
-; RV64I-NEXT:    sb a3, 7(a2)
-; RV64I-NEXT:    srli a0, a0, 56
-; RV64I-NEXT:    sb a0, 15(a2)
-; RV64I-NEXT:    ld ra, 216(sp) # 8-byte Folded Reload
-; RV64I-NEXT:    ld s0, 208(sp) # 8-byte Folded Reload
-; RV64I-NEXT:    ld s1, 200(sp) # 8-byte Folded Reload
-; RV64I-NEXT:    ld s2, 192(sp) # 8-byte Folded Reload
-; RV64I-NEXT:    ld s3, 184(sp) # 8-byte Folded Reload
-; RV64I-NEXT:    ld s4, 176(sp) # 8-byte Folded Reload
-; RV64I-NEXT:    ld s5, 168(sp) # 8-byte Folded Reload
-; RV64I-NEXT:    ld s6, 160(sp) # 8-byte Folded Reload
-; RV64I-NEXT:    ld s7, 152(sp) # 8-byte Folded Reload
-; RV64I-NEXT:    ld s8, 144(sp) # 8-byte Folded Reload
-; RV64I-NEXT:    ld s9, 136(sp) # 8-byte Folded Reload
-; RV64I-NEXT:    ld s10, 128(sp) # 8-byte Folded Reload
-; RV64I-NEXT:    ld s11, 120(sp) # 8-byte Folded Reload
-; RV64I-NEXT:    addi sp, sp, 224
+; RV64I-NEXT:    srli a1, a0, 56
+; RV64I-NEXT:    sb a1, 15(a2)
+; RV64I-NEXT:    srli a1, a0, 48
+; RV64I-NEXT:    sb a1, 14(a2)
+; RV64I-NEXT:    srli a1, a0, 40
+; RV64I-NEXT:    sb a1, 13(a2)
+; RV64I-NEXT:    srli a1, a0, 32
+; RV64I-NEXT:    sb a1, 12(a2)
+; RV64I-NEXT:    srli a1, a0, 24
+; RV64I-NEXT:    sb a1, 11(a2)
+; RV64I-NEXT:    srli a1, a0, 16
+; RV64I-NEXT:    sb a1, 10(a2)
+; RV64I-NEXT:    srli a0, a0, 8
+; RV64I-NEXT:    sb a0, 9(a2)
+; RV64I-NEXT:    addi sp, sp, 64
 ; RV64I-NEXT:    ret
 ;
 ; RV32I-LABEL: ashr_32bytes:
 ; RV32I:       # %bb.0:
-; RV32I-NEXT:    addi sp, sp, -144
-; RV32I-NEXT:    sw ra, 140(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s0, 136(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s1, 132(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s2, 128(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s3, 124(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s4, 120(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s5, 116(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s6, 112(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s7, 108(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s8, 104(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s9, 100(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s10, 96(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s11, 92(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    lbu t3, 31(a0)
-; RV32I-NEXT:    lbu a3, 0(a0)
-; RV32I-NEXT:    sw a3, 24(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    addi sp, sp, -64
 ; RV32I-NEXT:    lbu a3, 1(a0)
-; RV32I-NEXT:    sw a3, 20(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    lbu a3, 2(a0)
-; RV32I-NEXT:    sw a3, 16(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    lbu a3, 3(a0)
-; RV32I-NEXT:    sw a3, 12(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    lbu a3, 4(a0)
-; RV32I-NEXT:    sw a3, 8(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    lbu a3, 5(a0)
-; RV32I-NEXT:    sw a3, 4(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    lbu t2, 6(a0)
-; RV32I-NEXT:    lbu t4, 7(a0)
-; RV32I-NEXT:    lbu t5, 8(a0)
-; RV32I-NEXT:    lbu t6, 9(a0)
-; RV32I-NEXT:    lbu s0, 10(a0)
-; RV32I-NEXT:    lbu s1, 11(a0)
-; RV32I-NEXT:    lbu s2, 12(a0)
-; RV32I-NEXT:    lbu s3, 13(a0)
-; RV32I-NEXT:    lbu s4, 14(a0)
-; RV32I-NEXT:    lbu s5, 15(a0)
-; RV32I-NEXT:    lbu s6, 16(a0)
-; RV32I-NEXT:    lbu s7, 17(a0)
-; RV32I-NEXT:    lbu s8, 18(a0)
-; RV32I-NEXT:    lbu a3, 1(a1)
-; RV32I-NEXT:    lbu s9, 19(a0)
-; RV32I-NEXT:    lbu s10, 20(a0)
-; RV32I-NEXT:    lbu s11, 0(a1)
+; RV32I-NEXT:    lbu a4, 0(a0)
+; RV32I-NEXT:    lbu a5, 2(a0)
+; RV32I-NEXT:    lbu a6, 3(a0)
 ; RV32I-NEXT:    slli a3, a3, 8
-; RV32I-NEXT:    lbu ra, 2(a1)
-; RV32I-NEXT:    lbu a1, 3(a1)
-; RV32I-NEXT:    or a3, a3, s11
-; RV32I-NEXT:    lbu s11, 21(a0)
-; RV32I-NEXT:    slli ra, ra, 16
-; RV32I-NEXT:    slli a1, a1, 24
-; RV32I-NEXT:    or a1, a1, ra
-; RV32I-NEXT:    lbu ra, 22(a0)
-; RV32I-NEXT:    or t1, a1, a3
-; RV32I-NEXT:    lbu t0, 23(a0)
-; RV32I-NEXT:    lbu a7, 24(a0)
-; RV32I-NEXT:    lbu a6, 25(a0)
-; RV32I-NEXT:    lbu a5, 26(a0)
-; RV32I-NEXT:    lbu a1, 30(a0)
-; RV32I-NEXT:    lbu a3, 29(a0)
-; RV32I-NEXT:    lbu a4, 28(a0)
-; RV32I-NEXT:    lbu a0, 27(a0)
-; RV32I-NEXT:    sb a1, 58(sp)
-; RV32I-NEXT:    sb a3, 57(sp)
-; RV32I-NEXT:    sb a4, 56(sp)
-; RV32I-NEXT:    sb a0, 55(sp)
-; RV32I-NEXT:    sb a5, 54(sp)
-; RV32I-NEXT:    sb a6, 53(sp)
-; RV32I-NEXT:    sb a7, 52(sp)
-; RV32I-NEXT:    sb t0, 51(sp)
-; RV32I-NEXT:    sb ra, 50(sp)
-; RV32I-NEXT:    sb s11, 49(sp)
-; RV32I-NEXT:    sb s10, 48(sp)
-; RV32I-NEXT:    sb s9, 47(sp)
-; RV32I-NEXT:    sb s8, 46(sp)
-; RV32I-NEXT:    sb s7, 45(sp)
-; RV32I-NEXT:    sb s6, 44(sp)
-; RV32I-NEXT:    sb s5, 43(sp)
-; RV32I-NEXT:    sb t3, 59(sp)
-; RV32I-NEXT:    slli t3, t3, 24
-; RV32I-NEXT:    sb s4, 42(sp)
-; RV32I-NEXT:    sb s3, 41(sp)
-; RV32I-NEXT:    sb s2, 40(sp)
-; RV32I-NEXT:    sb s1, 39(sp)
-; RV32I-NEXT:    sb s0, 38(sp)
-; RV32I-NEXT:    sb t6, 37(sp)
-; RV32I-NEXT:    sb t5, 36(sp)
-; RV32I-NEXT:    sb t4, 35(sp)
-; RV32I-NEXT:    sb t2, 34(sp)
-; RV32I-NEXT:    lw a0, 4(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    sb a0, 33(sp)
-; RV32I-NEXT:    lw a0, 8(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    sb a0, 32(sp)
-; RV32I-NEXT:    lw a0, 12(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    sb a0, 31(sp)
-; RV32I-NEXT:    lw a0, 16(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    sb a0, 30(sp)
-; RV32I-NEXT:    lw a0, 20(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    sb a0, 29(sp)
-; RV32I-NEXT:    lw a0, 24(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    sb a0, 28(sp)
-; RV32I-NEXT:    srai a0, t3, 31
-; RV32I-NEXT:    sb a0, 88(sp)
-; RV32I-NEXT:    sb a0, 84(sp)
-; RV32I-NEXT:    sb a0, 80(sp)
-; RV32I-NEXT:    sb a0, 76(sp)
-; RV32I-NEXT:    sb a0, 72(sp)
-; RV32I-NEXT:    sb a0, 68(sp)
-; RV32I-NEXT:    sb a0, 64(sp)
-; RV32I-NEXT:    sb a0, 60(sp)
-; RV32I-NEXT:    srli a1, a0, 24
-; RV32I-NEXT:    sb a1, 91(sp)
-; RV32I-NEXT:    srli a3, a0, 16
-; RV32I-NEXT:    sb a3, 90(sp)
-; RV32I-NEXT:    srli a0, a0, 8
-; RV32I-NEXT:    sb a0, 89(sp)
-; RV32I-NEXT:    sb a1, 87(sp)
-; RV32I-NEXT:    sb a3, 86(sp)
-; RV32I-NEXT:    sb a0, 85(sp)
-; RV32I-NEXT:    sb a1, 83(sp)
-; RV32I-NEXT:    sb a3, 82(sp)
-; RV32I-NEXT:    sb a0, 81(sp)
-; RV32I-NEXT:    sb a1, 79(sp)
-; RV32I-NEXT:    sb a3, 78(sp)
-; RV32I-NEXT:    sb a0, 77(sp)
-; RV32I-NEXT:    sb a1, 75(sp)
-; RV32I-NEXT:    sb a3, 74(sp)
-; RV32I-NEXT:    sb a0, 73(sp)
-; RV32I-NEXT:    sb a1, 71(sp)
-; RV32I-NEXT:    sb a3, 70(sp)
-; RV32I-NEXT:    sb a0, 69(sp)
-; RV32I-NEXT:    sb a1, 67(sp)
-; RV32I-NEXT:    sb a3, 66(sp)
-; RV32I-NEXT:    sb a0, 65(sp)
-; RV32I-NEXT:    sb a1, 63(sp)
-; RV32I-NEXT:    sb a3, 62(sp)
-; RV32I-NEXT:    sb a0, 61(sp)
-; RV32I-NEXT:    slli a0, t1, 24
-; RV32I-NEXT:    srli a0, a0, 27
-; RV32I-NEXT:    addi a4, sp, 28
-; RV32I-NEXT:    add a4, a4, a0
-; RV32I-NEXT:    lbu a0, 5(a4)
-; RV32I-NEXT:    lbu a1, 4(a4)
-; RV32I-NEXT:    lbu a3, 6(a4)
-; RV32I-NEXT:    lbu a5, 7(a4)
-; RV32I-NEXT:    slli a0, a0, 8
-; RV32I-NEXT:    or a0, a0, a1
-; RV32I-NEXT:    slli a3, a3, 16
-; RV32I-NEXT:    slli a5, a5, 24
-; RV32I-NEXT:    or a3, a5, a3
-; RV32I-NEXT:    or t5, a3, a0
-; RV32I-NEXT:    andi a3, t1, 7
-; RV32I-NEXT:    lbu a0, 9(a4)
-; RV32I-NEXT:    lbu a1, 8(a4)
-; RV32I-NEXT:    lbu a5, 10(a4)
-; RV32I-NEXT:    lbu a6, 11(a4)
-; RV32I-NEXT:    slli a0, a0, 8
-; RV32I-NEXT:    or a0, a0, a1
+; RV32I-NEXT:    or a3, a3, a4
 ; RV32I-NEXT:    slli a5, a5, 16
 ; RV32I-NEXT:    slli a6, a6, 24
-; RV32I-NEXT:    or a1, a6, a5
-; RV32I-NEXT:    or a6, a1, a0
-; RV32I-NEXT:    slli a0, a6, 1
-; RV32I-NEXT:    not t1, a3
-; RV32I-NEXT:    sll a0, a0, t1
-; RV32I-NEXT:    lbu a1, 1(a4)
-; RV32I-NEXT:    lbu a5, 0(a4)
-; RV32I-NEXT:    lbu a7, 2(a4)
-; RV32I-NEXT:    lbu t0, 3(a4)
-; RV32I-NEXT:    slli a1, a1, 8
-; RV32I-NEXT:    or a1, a1, a5
+; RV32I-NEXT:    or a4, a6, a5
+; RV32I-NEXT:    or a3, a4, a3
+; RV32I-NEXT:    lbu a4, 5(a0)
+; RV32I-NEXT:    lbu a5, 4(a0)
+; RV32I-NEXT:    lbu a6, 6(a0)
+; RV32I-NEXT:    lbu a7, 7(a0)
+; RV32I-NEXT:    slli a4, a4, 8
+; RV32I-NEXT:    or a4, a4, a5
+; RV32I-NEXT:    slli a6, a6, 16
+; RV32I-NEXT:    slli a7, a7, 24
+; RV32I-NEXT:    or a5, a7, a6
+; RV32I-NEXT:    or a4, a5, a4
+; RV32I-NEXT:    lbu a5, 9(a0)
+; RV32I-NEXT:    lbu a6, 8(a0)
+; RV32I-NEXT:    lbu a7, 10(a0)
+; RV32I-NEXT:    lbu t0, 11(a0)
+; RV32I-NEXT:    slli a5, a5, 8
+; RV32I-NEXT:    or a5, a5, a6
 ; RV32I-NEXT:    slli a7, a7, 16
 ; RV32I-NEXT:    slli t0, t0, 24
-; RV32I-NEXT:    or a5, t0, a7
-; RV32I-NEXT:    or t0, a5, a1
-; RV32I-NEXT:    slli a1, t5, 1
-; RV32I-NEXT:    xori t2, a3, 31
-; RV32I-NEXT:    sll a1, a1, t2
-; RV32I-NEXT:    lbu a5, 13(a4)
-; RV32I-NEXT:    lbu a7, 12(a4)
-; RV32I-NEXT:    lbu t3, 14(a4)
-; RV32I-NEXT:    lbu t4, 15(a4)
-; RV32I-NEXT:    slli a5, a5, 8
-; RV32I-NEXT:    or a5, a5, a7
+; RV32I-NEXT:    or a6, t0, a7
+; RV32I-NEXT:    or a5, a6, a5
+; RV32I-NEXT:    lbu a6, 13(a0)
+; RV32I-NEXT:    lbu a7, 12(a0)
+; RV32I-NEXT:    lbu t0, 14(a0)
+; RV32I-NEXT:    lbu t1, 15(a0)
+; RV32I-NEXT:    slli a6, a6, 8
+; RV32I-NEXT:    or a6, a6, a7
+; RV32I-NEXT:    slli t0, t0, 16
+; RV32I-NEXT:    slli t1, t1, 24
+; RV32I-NEXT:    or a7, t1, t0
+; RV32I-NEXT:    or a6, a7, a6
+; RV32I-NEXT:    lbu a7, 17(a0)
+; RV32I-NEXT:    lbu t0, 16(a0)
+; RV32I-NEXT:    lbu t1, 18(a0)
+; RV32I-NEXT:    lbu t2, 19(a0)
+; RV32I-NEXT:    slli a7, a7, 8
+; RV32I-NEXT:    or a7, a7, t0
+; RV32I-NEXT:    slli t1, t1, 16
+; RV32I-NEXT:    slli t2, t2, 24
+; RV32I-NEXT:    or t0, t2, t1
+; RV32I-NEXT:    or t0, t0, a7
+; RV32I-NEXT:    lbu a7, 21(a0)
+; RV32I-NEXT:    lbu t1, 20(a0)
+; RV32I-NEXT:    lbu t2, 22(a0)
+; RV32I-NEXT:    lbu t3, 23(a0)
+; RV32I-NEXT:    slli a7, a7, 8
+; RV32I-NEXT:    or a7, a7, t1
+; RV32I-NEXT:    slli t2, t2, 16
+; RV32I-NEXT:    slli t3, t3, 24
+; RV32I-NEXT:    or t1, t3, t2
+; RV32I-NEXT:    or t1, t1, a7
+; RV32I-NEXT:    lbu a7, 25(a0)
+; RV32I-NEXT:    lbu t2, 24(a0)
+; RV32I-NEXT:    lbu t3, 26(a0)
+; RV32I-NEXT:    lbu t4, 27(a0)
+; RV32I-NEXT:    slli a7, a7, 8
+; RV32I-NEXT:    or a7, a7, t2
 ; RV32I-NEXT:    slli t3, t3, 16
 ; RV32I-NEXT:    slli t4, t4, 24
-; RV32I-NEXT:    or a7, t4, t3
-; RV32I-NEXT:    or t3, a7, a5
-; RV32I-NEXT:    lbu a5, 17(a4)
-; RV32I-NEXT:    lbu a7, 16(a4)
-; RV32I-NEXT:    lbu t4, 18(a4)
-; RV32I-NEXT:    lbu t6, 19(a4)
-; RV32I-NEXT:    slli a5, a5, 8
-; RV32I-NEXT:    or a5, a5, a7
+; RV32I-NEXT:    or t2, t4, t3
+; RV32I-NEXT:    or t2, t2, a7
+; RV32I-NEXT:    lbu a7, 29(a0)
+; RV32I-NEXT:    lbu t3, 28(a0)
+; RV32I-NEXT:    lbu t4, 30(a0)
+; RV32I-NEXT:    lbu a0, 31(a0)
+; RV32I-NEXT:    slli a7, a7, 8
+; RV32I-NEXT:    or a7, a7, t3
 ; RV32I-NEXT:    slli t4, t4, 16
-; RV32I-NEXT:    slli t6, t6, 24
-; RV32I-NEXT:    or a7, t6, t4
-; RV32I-NEXT:    or t4, a7, a5
-; RV32I-NEXT:    slli a5, t4, 1
-; RV32I-NEXT:    sll a7, a5, t1
-; RV32I-NEXT:    lbu a5, 21(a4)
-; RV32I-NEXT:    lbu t6, 20(a4)
-; RV32I-NEXT:    lbu s0, 22(a4)
-; RV32I-NEXT:    lbu s1, 23(a4)
-; RV32I-NEXT:    slli a5, a5, 8
-; RV32I-NEXT:    or a5, a5, t6
-; RV32I-NEXT:    slli s0, s0, 16
-; RV32I-NEXT:    slli s1, s1, 24
-; RV32I-NEXT:    or s0, s1, s0
-; RV32I-NEXT:    or s0, s0, a5
-; RV32I-NEXT:    lbu a5, 25(a4)
-; RV32I-NEXT:    lbu t6, 24(a4)
-; RV32I-NEXT:    lbu s1, 26(a4)
-; RV32I-NEXT:    lbu s2, 27(a4)
-; RV32I-NEXT:    slli a5, a5, 8
-; RV32I-NEXT:    or a5, a5, t6
-; RV32I-NEXT:    slli s1, s1, 16
-; RV32I-NEXT:    slli s2, s2, 24
-; RV32I-NEXT:    or t6, s2, s1
-; RV32I-NEXT:    or t6, t6, a5
-; RV32I-NEXT:    lbu a5, 29(a4)
-; RV32I-NEXT:    lbu s1, 28(a4)
-; RV32I-NEXT:    slli s2, t6, 1
-; RV32I-NEXT:    sll t1, s2, t1
-; RV32I-NEXT:    slli a5, a5, 8
-; RV32I-NEXT:    or a5, a5, s1
-; RV32I-NEXT:    lbu s1, 30(a4)
-; RV32I-NEXT:    lbu a4, 31(a4)
-; RV32I-NEXT:    slli s2, t3, 1
-; RV32I-NEXT:    sll s2, s2, t2
-; RV32I-NEXT:    slli s1, s1, 16
-; RV32I-NEXT:    slli a4, a4, 24
-; RV32I-NEXT:    or a4, a4, s1
-; RV32I-NEXT:    slli s1, s0, 1
-; RV32I-NEXT:    sll s1, s1, t2
-; RV32I-NEXT:    or s3, a4, a5
-; RV32I-NEXT:    slli a4, s3, 1
-; RV32I-NEXT:    sll t2, a4, t2
-; RV32I-NEXT:    srl a4, t5, a3
-; RV32I-NEXT:    srl a5, t0, a3
-; RV32I-NEXT:    srl t0, t3, a3
-; RV32I-NEXT:    srl a6, a6, a3
-; RV32I-NEXT:    srl t3, s0, a3
-; RV32I-NEXT:    srl t4, t4, a3
-; RV32I-NEXT:    srl t5, t6, a3
-; RV32I-NEXT:    sra a3, s3, a3
-; RV32I-NEXT:    srli t6, t5, 16
-; RV32I-NEXT:    sb t6, 26(a2)
-; RV32I-NEXT:    or t2, t5, t2
-; RV32I-NEXT:    sb t5, 24(a2)
-; RV32I-NEXT:    srli t5, t5, 8
-; RV32I-NEXT:    sb t5, 25(a2)
-; RV32I-NEXT:    srli t5, a3, 24
-; RV32I-NEXT:    sb t5, 31(a2)
-; RV32I-NEXT:    srli t5, a3, 16
-; RV32I-NEXT:    sb t5, 30(a2)
-; RV32I-NEXT:    sb a3, 28(a2)
-; RV32I-NEXT:    srli a3, a3, 8
-; RV32I-NEXT:    sb a3, 29(a2)
-; RV32I-NEXT:    srli a3, t4, 16
-; RV32I-NEXT:    sb a3, 18(a2)
-; RV32I-NEXT:    or a3, t4, s1
-; RV32I-NEXT:    sb t4, 16(a2)
-; RV32I-NEXT:    srli t4, t4, 8
-; RV32I-NEXT:    sb t4, 17(a2)
-; RV32I-NEXT:    srli t4, t3, 16
-; RV32I-NEXT:    sb t4, 22(a2)
-; RV32I-NEXT:    or t1, t3, t1
-; RV32I-NEXT:    sb t3, 20(a2)
-; RV32I-NEXT:    srli t3, t3, 8
-; RV32I-NEXT:    sb t3, 21(a2)
-; RV32I-NEXT:    srli t3, a6, 16
-; RV32I-NEXT:    sb t3, 10(a2)
-; RV32I-NEXT:    or t3, a6, s2
-; RV32I-NEXT:    sb a6, 8(a2)
-; RV32I-NEXT:    srli a6, a6, 8
-; RV32I-NEXT:    sb a6, 9(a2)
-; RV32I-NEXT:    srli a6, t0, 16
-; RV32I-NEXT:    sb a6, 14(a2)
-; RV32I-NEXT:    or a6, t0, a7
-; RV32I-NEXT:    sb t0, 12(a2)
-; RV32I-NEXT:    srli a7, t0, 8
-; RV32I-NEXT:    sb a7, 13(a2)
-; RV32I-NEXT:    srli a7, a5, 16
-; RV32I-NEXT:    sb a7, 2(a2)
-; RV32I-NEXT:    or a1, a5, a1
-; RV32I-NEXT:    sb a5, 0(a2)
-; RV32I-NEXT:    srli a5, a5, 8
-; RV32I-NEXT:    sb a5, 1(a2)
-; RV32I-NEXT:    srli a5, a4, 16
-; RV32I-NEXT:    sb a5, 6(a2)
-; RV32I-NEXT:    or a0, a4, a0
-; RV32I-NEXT:    sb a4, 4(a2)
+; RV32I-NEXT:    slli a0, a0, 24
+; RV32I-NEXT:    or t3, a0, t4
+; RV32I-NEXT:    or t3, t3, a7
+; RV32I-NEXT:    lbu a7, 1(a1)
+; RV32I-NEXT:    lbu t4, 0(a1)
+; RV32I-NEXT:    lbu t5, 2(a1)
+; RV32I-NEXT:    lbu a1, 3(a1)
+; RV32I-NEXT:    slli a7, a7, 8
+; RV32I-NEXT:    or a7, a7, t4
+; RV32I-NEXT:    slli t5, t5, 16
+; RV32I-NEXT:    slli a1, a1, 24
+; RV32I-NEXT:    or a1, a1, t5
+; RV32I-NEXT:    or a7, a1, a7
+; RV32I-NEXT:    srai a0, a0, 31
+; RV32I-NEXT:    sw a0, 60(sp)
+; RV32I-NEXT:    sw a0, 56(sp)
+; RV32I-NEXT:    sw a0, 52(sp)
+; RV32I-NEXT:    sw a0, 48(sp)
+; RV32I-NEXT:    sw a0, 44(sp)
+; RV32I-NEXT:    sw a0, 40(sp)
+; RV32I-NEXT:    sw a0, 36(sp)
+; RV32I-NEXT:    sw a0, 32(sp)
+; RV32I-NEXT:    sw t3, 28(sp)
+; RV32I-NEXT:    sw t2, 24(sp)
+; RV32I-NEXT:    sw t1, 20(sp)
+; RV32I-NEXT:    sw t0, 16(sp)
+; RV32I-NEXT:    sw a6, 12(sp)
+; RV32I-NEXT:    sw a5, 8(sp)
+; RV32I-NEXT:    sw a4, 4(sp)
+; RV32I-NEXT:    sw a3, 0(sp)
+; RV32I-NEXT:    srli a0, a7, 3
+; RV32I-NEXT:    andi a0, a0, 28
+; RV32I-NEXT:    mv a1, sp
+; RV32I-NEXT:    add a4, a1, a0
+; RV32I-NEXT:    lw a1, 4(a4)
+; RV32I-NEXT:    srl a0, a1, a7
+; RV32I-NEXT:    lw a5, 8(a4)
+; RV32I-NEXT:    andi a3, a7, 31
+; RV32I-NEXT:    xori a6, a3, 31
+; RV32I-NEXT:    lw a3, 0(a4)
+; RV32I-NEXT:    slli t0, a5, 1
+; RV32I-NEXT:    sll t0, t0, a6
+; RV32I-NEXT:    or a0, a0, t0
+; RV32I-NEXT:    srl a3, a3, a7
+; RV32I-NEXT:    slli a1, a1, 1
+; RV32I-NEXT:    lw t0, 12(a4)
+; RV32I-NEXT:    lw t1, 16(a4)
+; RV32I-NEXT:    sll a1, a1, a6
+; RV32I-NEXT:    or a1, a3, a1
+; RV32I-NEXT:    srl a3, t0, a7
+; RV32I-NEXT:    slli t2, t1, 1
+; RV32I-NEXT:    sll t2, t2, a6
+; RV32I-NEXT:    or a3, a3, t2
+; RV32I-NEXT:    srl a5, a5, a7
+; RV32I-NEXT:    slli t0, t0, 1
+; RV32I-NEXT:    lw t2, 20(a4)
+; RV32I-NEXT:    lw t3, 24(a4)
+; RV32I-NEXT:    sll t0, t0, a6
+; RV32I-NEXT:    or a5, a5, t0
+; RV32I-NEXT:    srl t0, t2, a7
+; RV32I-NEXT:    slli t4, t3, 1
+; RV32I-NEXT:    sll t4, t4, a6
+; RV32I-NEXT:    or t0, t0, t4
+; RV32I-NEXT:    srl t1, t1, a7
+; RV32I-NEXT:    slli t2, t2, 1
+; RV32I-NEXT:    lw a4, 28(a4)
+; RV32I-NEXT:    sll t2, t2, a6
+; RV32I-NEXT:    or t1, t1, t2
+; RV32I-NEXT:    srl t2, t3, a7
+; RV32I-NEXT:    slli t3, a4, 1
+; RV32I-NEXT:    sll a6, t3, a6
+; RV32I-NEXT:    or a6, t2, a6
+; RV32I-NEXT:    sra a4, a4, a7
+; RV32I-NEXT:    sb a4, 28(a2)
+; RV32I-NEXT:    srli a7, a4, 24
+; RV32I-NEXT:    sb a7, 31(a2)
+; RV32I-NEXT:    srli a7, a4, 16
+; RV32I-NEXT:    sb a7, 30(a2)
 ; RV32I-NEXT:    srli a4, a4, 8
-; RV32I-NEXT:    sb a4, 5(a2)
-; RV32I-NEXT:    srli a4, t2, 24
+; RV32I-NEXT:    sb a4, 29(a2)
+; RV32I-NEXT:    sb a6, 24(a2)
+; RV32I-NEXT:    sb t1, 16(a2)
+; RV32I-NEXT:    sb t0, 20(a2)
+; RV32I-NEXT:    sb a5, 8(a2)
+; RV32I-NEXT:    sb a3, 12(a2)
+; RV32I-NEXT:    sb a1, 0(a2)
+; RV32I-NEXT:    sb a0, 4(a2)
+; RV32I-NEXT:    srli a4, a6, 24
 ; RV32I-NEXT:    sb a4, 27(a2)
-; RV32I-NEXT:    srli a3, a3, 24
-; RV32I-NEXT:    sb a3, 19(a2)
-; RV32I-NEXT:    srli a3, t1, 24
-; RV32I-NEXT:    sb a3, 23(a2)
-; RV32I-NEXT:    srli a3, t3, 24
-; RV32I-NEXT:    sb a3, 11(a2)
-; RV32I-NEXT:    srli a3, a6, 24
-; RV32I-NEXT:    sb a3, 15(a2)
-; RV32I-NEXT:    srli a1, a1, 24
-; RV32I-NEXT:    sb a1, 3(a2)
-; RV32I-NEXT:    srli a0, a0, 24
-; RV32I-NEXT:    sb a0, 7(a2)
-; RV32I-NEXT:    lw ra, 140(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s0, 136(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s1, 132(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s2, 128(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s3, 124(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s4, 120(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s5, 116(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s6, 112(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s7, 108(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s8, 104(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s9, 100(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s10, 96(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s11, 92(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    addi sp, sp, 144
+; RV32I-NEXT:    srli a4, a6, 16
+; RV32I-NEXT:    sb a4, 26(a2)
+; RV32I-NEXT:    srli a4, a6, 8
+; RV32I-NEXT:    sb a4, 25(a2)
+; RV32I-NEXT:    srli a4, t1, 24
+; RV32I-NEXT:    sb a4, 19(a2)
+; RV32I-NEXT:    srli a4, t1, 16
+; RV32I-NEXT:    sb a4, 18(a2)
+; RV32I-NEXT:    srli a4, t1, 8
+; RV32I-NEXT:    sb a4, 17(a2)
+; RV32I-NEXT:    srli a4, t0, 24
+; RV32I-NEXT:    sb a4, 23(a2)
+; RV32I-NEXT:    srli a4, t0, 16
+; RV32I-NEXT:    sb a4, 22(a2)
+; RV32I-NEXT:    srli a4, t0, 8
+; RV32I-NEXT:    sb a4, 21(a2)
+; RV32I-NEXT:    srli a4, a5, 24
+; RV32I-NEXT:    sb a4, 11(a2)
+; RV32I-NEXT:    srli a4, a5, 16
+; RV32I-NEXT:    sb a4, 10(a2)
+; RV32I-NEXT:    srli a5, a5, 8
+; RV32I-NEXT:    sb a5, 9(a2)
+; RV32I-NEXT:    srli a4, a3, 24
+; RV32I-NEXT:    sb a4, 15(a2)
+; RV32I-NEXT:    srli a4, a3, 16
+; RV32I-NEXT:    sb a4, 14(a2)
+; RV32I-NEXT:    srli a3, a3, 8
+; RV32I-NEXT:    sb a3, 13(a2)
+; RV32I-NEXT:    srli a3, a1, 24
+; RV32I-NEXT:    sb a3, 3(a2)
+; RV32I-NEXT:    srli a3, a1, 16
+; RV32I-NEXT:    sb a3, 2(a2)
+; RV32I-NEXT:    srli a1, a1, 8
+; RV32I-NEXT:    sb a1, 1(a2)
+; RV32I-NEXT:    srli a1, a0, 24
+; RV32I-NEXT:    sb a1, 7(a2)
+; RV32I-NEXT:    srli a1, a0, 16
+; RV32I-NEXT:    sb a1, 6(a2)
+; RV32I-NEXT:    srli a0, a0, 8
+; RV32I-NEXT:    sb a0, 5(a2)
+; RV32I-NEXT:    addi sp, sp, 64
 ; RV32I-NEXT:    ret
   %src = load i256, ptr %src.ptr, align 1
   %bitOff = load i256, ptr %bitOff.ptr, align 1
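
For reference on the RV32I delta above: the new sequence is the
shift-through-stack lowering this patch improves. The i256 value is
spilled to an aligned stack slot, the slots above it are filled with the
sign word, the shift amount selects a word-aligned base pointer
(srli a0, a7, 3 / andi a0, a0, 28), and each result word then needs at
most one residual sub-word shift. Below is a minimal C++ sketch of that
strategy, illustrative only and not the LLVM implementation; the
function name and buffer layout are invented for this note:

    // Illustrative sketch (not LLVM code): arithmetic right shift of a
    // 256-bit value held as 8 little-endian 32-bit words, with Amt < 256.
    #include <cstdint>
    #include <cstring>

    void ashr256(uint32_t Dst[8], const uint32_t Src[8], unsigned Amt) {
      // Spill to a buffer of twice the value's size; the upper half holds
      // the sign word, mirroring the sw fills of 32(sp)..60(sp) above.
      uint32_t Buf[16];
      std::memcpy(Buf, Src, 8 * sizeof(uint32_t));
      const uint32_t Sign = (Src[7] >> 31) ? 0xFFFFFFFFu : 0u;
      for (int I = 8; I < 16; ++I)
        Buf[I] = Sign;
      const uint32_t *Base = Buf + Amt / 32; // (Amt >> 3) & 28 bytes off sp
      const unsigned Rem = Amt % 32;         // andi a3, a7, 31
      for (int I = 0; I < 8; ++I)
        // (Hi << 1) << (31 - Rem) is the shift-overflow-free form the
        // generated code uses (slli ..., 1 plus sll by Rem ^ 31); it
        // evaluates to zero when Rem == 0.
        Dst[I] = (Base[I] >> Rem) | ((Base[I + 1] << 1) << (31 - Rem));
    }

When the shift amount is a known multiple of 32, Rem is zero and the
combine term folds away; that is the "one-step" case in which the
aligned load needs no follow-up shift at all.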

diff  --git a/llvm/test/CodeGen/X86/div-rem-pair-recomposition-signed.ll b/llvm/test/CodeGen/X86/div-rem-pair-recomposition-signed.ll
index 1d3b015f3c5479..c350ed64280dd2 100644
--- a/llvm/test/CodeGen/X86/div-rem-pair-recomposition-signed.ll
+++ b/llvm/test/CodeGen/X86/div-rem-pair-recomposition-signed.ll
@@ -174,22 +174,23 @@ define i128 @scalar_i128(i128 %x, i128 %y, ptr %divdst) nounwind {
 ; X86-LABEL: scalar_i128:
 ; X86:       # %bb.0: # %_udiv-special-cases
 ; X86-NEXT:    pushl %ebp
+; X86-NEXT:    movl %esp, %ebp
 ; X86-NEXT:    pushl %ebx
 ; X86-NEXT:    pushl %edi
 ; X86-NEXT:    pushl %esi
-; X86-NEXT:    subl $156, %esp
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; X86-NEXT:    movl %esi, %eax
+; X86-NEXT:    andl $-16, %esp
+; X86-NEXT:    subl $176, %esp
+; X86-NEXT:    movl 20(%ebp), %edx
+; X86-NEXT:    movl 24(%ebp), %ecx
+; X86-NEXT:    movl %ecx, %eax
 ; X86-NEXT:    sarl $31, %eax
-; X86-NEXT:    xorl %eax, %esi
-; X86-NEXT:    movl %esi, %edi
+; X86-NEXT:    xorl %eax, %ecx
+; X86-NEXT:    movl %ecx, %edi
 ; X86-NEXT:    xorl %eax, %edx
 ; X86-NEXT:    movl %edx, %esi
-; X86-NEXT:    movl %ecx, %edx
+; X86-NEXT:    movl 16(%ebp), %edx
 ; X86-NEXT:    xorl %eax, %edx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    movl 12(%ebp), %ecx
 ; X86-NEXT:    xorl %eax, %ecx
 ; X86-NEXT:    subl %eax, %ecx
 ; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
@@ -198,32 +199,33 @@ define i128 @scalar_i128(i128 %x, i128 %y, ptr %divdst) nounwind {
 ; X86-NEXT:    sbbl %eax, %esi
 ; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    sbbl %eax, %edi
-; X86-NEXT:    movl %edi, (%esp) # 4-byte Spill
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl 40(%ebp), %ecx
 ; X86-NEXT:    movl %ecx, %edx
 ; X86-NEXT:    sarl $31, %edx
 ; X86-NEXT:    movl %ecx, %esi
 ; X86-NEXT:    xorl %edx, %esi
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebx
+; X86-NEXT:    movl 36(%ebp), %ecx
+; X86-NEXT:    xorl %edx, %ecx
+; X86-NEXT:    movl 32(%ebp), %ebx
 ; X86-NEXT:    xorl %edx, %ebx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebp
-; X86-NEXT:    xorl %edx, %ebp
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
+; X86-NEXT:    movl 28(%ebp), %edi
 ; X86-NEXT:    xorl %edx, %edi
 ; X86-NEXT:    subl %edx, %edi
-; X86-NEXT:    sbbl %edx, %ebp
 ; X86-NEXT:    sbbl %edx, %ebx
+; X86-NEXT:    sbbl %edx, %ecx
+; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    sbbl %edx, %esi
 ; X86-NEXT:    xorl %eax, %edx
 ; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl %ebp, %eax
+; X86-NEXT:    movl %ebx, %eax
 ; X86-NEXT:    orl %esi, %eax
 ; X86-NEXT:    movl %edi, %ecx
-; X86-NEXT:    orl %ebx, %ecx
+; X86-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
 ; X86-NEXT:    orl %eax, %ecx
 ; X86-NEXT:    sete %cl
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT:    orl (%esp), %eax # 4-byte Folded Reload
+; X86-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
 ; X86-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
 ; X86-NEXT:    orl %eax, %edx
@@ -232,359 +234,357 @@ define i128 @scalar_i128(i128 %x, i128 %y, ptr %divdst) nounwind {
 ; X86-NEXT:    movb %al, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill
 ; X86-NEXT:    bsrl %esi, %edx
 ; X86-NEXT:    xorl $31, %edx
-; X86-NEXT:    bsrl %ebx, %ecx
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT:    bsrl %eax, %ecx
 ; X86-NEXT:    xorl $31, %ecx
 ; X86-NEXT:    orl $32, %ecx
 ; X86-NEXT:    testl %esi, %esi
 ; X86-NEXT:    cmovnel %edx, %ecx
-; X86-NEXT:    bsrl %ebp, %edx
+; X86-NEXT:    bsrl %ebx, %edx
 ; X86-NEXT:    xorl $31, %edx
 ; X86-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    bsrl %edi, %edi
 ; X86-NEXT:    xorl $31, %edi
 ; X86-NEXT:    orl $32, %edi
-; X86-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    testl %ebp, %ebp
+; X86-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    testl %ebx, %ebx
 ; X86-NEXT:    cmovnel %edx, %edi
 ; X86-NEXT:    orl $64, %edi
-; X86-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl %ebx, %edx
+; X86-NEXT:    movl %eax, %edx
 ; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    orl %esi, %edx
 ; X86-NEXT:    cmovnel %ecx, %edi
-; X86-NEXT:    movl (%esp), %ebx # 4-byte Reload
-; X86-NEXT:    bsrl %ebx, %edx
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT:    bsrl %eax, %edx
 ; X86-NEXT:    xorl $31, %edx
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
-; X86-NEXT:    bsrl %ebp, %ecx
+; X86-NEXT:    bsrl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
 ; X86-NEXT:    xorl $31, %ecx
 ; X86-NEXT:    orl $32, %ecx
-; X86-NEXT:    testl %ebx, %ebx
+; X86-NEXT:    testl %eax, %eax
 ; X86-NEXT:    cmovnel %edx, %ecx
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT:    bsrl %eax, %esi
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; X86-NEXT:    bsrl %ebx, %esi
 ; X86-NEXT:    xorl $31, %esi
 ; X86-NEXT:    bsrl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
 ; X86-NEXT:    xorl $31, %edx
 ; X86-NEXT:    orl $32, %edx
-; X86-NEXT:    testl %eax, %eax
+; X86-NEXT:    testl %ebx, %ebx
 ; X86-NEXT:    cmovnel %esi, %edx
 ; X86-NEXT:    orl $64, %edx
-; X86-NEXT:    movl %ebp, %esi
-; X86-NEXT:    orl %ebx, %esi
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X86-NEXT:    orl %eax, %esi
 ; X86-NEXT:    cmovnel %ecx, %edx
 ; X86-NEXT:    xorl %ebx, %ebx
 ; X86-NEXT:    subl %edx, %edi
 ; X86-NEXT:    movl $0, %edx
 ; X86-NEXT:    sbbl %edx, %edx
-; X86-NEXT:    movl $0, %eax
-; X86-NEXT:    sbbl %eax, %eax
 ; X86-NEXT:    movl $0, %esi
 ; X86-NEXT:    sbbl %esi, %esi
+; X86-NEXT:    movl $0, %eax
+; X86-NEXT:    sbbl %eax, %eax
 ; X86-NEXT:    movl $127, %ecx
 ; X86-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    cmpl %edi, %ecx
 ; X86-NEXT:    movl $0, %ecx
-; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    sbbl %edx, %ecx
 ; X86-NEXT:    movl $0, %ecx
-; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    sbbl %eax, %ecx
-; X86-NEXT:    movl $0, %ecx
 ; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    sbbl %esi, %ecx
+; X86-NEXT:    movl $0, %ecx
+; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    sbbl %eax, %ecx
 ; X86-NEXT:    setb %cl
 ; X86-NEXT:    orb {{[-0-9]+}}(%e{{[sb]}}p), %cl # 1-byte Folded Reload
-; X86-NEXT:    movl (%esp), %edx # 4-byte Reload
-; X86-NEXT:    cmovnel %ebx, %edx
-; X86-NEXT:    cmovnel %ebx, %ebp
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X86-NEXT:    cmovnel %ebx, %esi
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT:    cmovnel %ebx, %eax
+; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
 ; X86-NEXT:    cmovnel %ebx, %eax
 ; X86-NEXT:    cmovel {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
-; X86-NEXT:    movl %ebx, %esi
-; X86-NEXT:    jne .LBB4_8
-; X86-NEXT:  # %bb.1: # %_udiv-special-cases
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; X86-NEXT:    xorl $127, %edi
-; X86-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; X86-NEXT:    movl %ebx, %ecx
+; X86-NEXT:    jne .LBB4_1
+; X86-NEXT:  # %bb.8: # %_udiv-special-cases
+; X86-NEXT:    movl %edx, %edi
+; X86-NEXT:    movl %eax, %edx
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT:    xorl $127, %eax
+; X86-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; X86-NEXT:    movl %edi, %ecx
 ; X86-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
-; X86-NEXT:    orl %edi, %ecx
-; X86-NEXT:    je .LBB4_8
-; X86-NEXT:  # %bb.2: # %udiv-bb1
+; X86-NEXT:    orl %eax, %ecx
+; X86-NEXT:    movl %edx, %eax
+; X86-NEXT:    movl %ebx, %edx
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; X86-NEXT:    je .LBB4_9
+; X86-NEXT:  # %bb.5: # %udiv-bb1
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
 ; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-NEXT:    xorps %xmm0, %xmm0
+; X86-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
 ; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
 ; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl (%esp), %eax # 4-byte Reload
-; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT:    # kill: def $al killed $al killed $eax
-; X86-NEXT:    xorb $127, %al
-; X86-NEXT:    movb %al, %ch
-; X86-NEXT:    andb $7, %ch
+; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; X86-NEXT:    movl %ebx, %ecx
+; X86-NEXT:    xorb $127, %cl
+; X86-NEXT:    movl %ecx, %eax
 ; X86-NEXT:    shrb $3, %al
-; X86-NEXT:    andb $15, %al
+; X86-NEXT:    andb $12, %al
 ; X86-NEXT:    negb %al
-; X86-NEXT:    movsbl %al, %edi
-; X86-NEXT:    movl 148(%esp,%edi), %edx
-; X86-NEXT:    movl 152(%esp,%edi), %esi
-; X86-NEXT:    movb %ch, %cl
-; X86-NEXT:    shldl %cl, %edx, %esi
-; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    shll %cl, %edx
-; X86-NEXT:    notb %cl
-; X86-NEXT:    movl 144(%esp,%edi), %eax
-; X86-NEXT:    movl %eax, %ebp
-; X86-NEXT:    shrl %ebp
-; X86-NEXT:    shrl %cl, %ebp
-; X86-NEXT:    orl %edx, %ebp
-; X86-NEXT:    movl 140(%esp,%edi), %edx
-; X86-NEXT:    movb %ch, %cl
+; X86-NEXT:    movsbl %al, %eax
+; X86-NEXT:    movl 152(%esp,%eax), %esi
+; X86-NEXT:    movl 156(%esp,%eax), %edx
+; X86-NEXT:    shldl %cl, %esi, %edx
+; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl 144(%esp,%eax), %edx
+; X86-NEXT:    movl 148(%esp,%eax), %eax
+; X86-NEXT:    shldl %cl, %eax, %esi
 ; X86-NEXT:    shldl %cl, %edx, %eax
 ; X86-NEXT:    shll %cl, %edx
 ; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    addl $1, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X86-NEXT:    adcl $0, %ebx
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X86-NEXT:    addl $1, %ebx
+; X86-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    adcl $0, %edi
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT:    adcl $0, %ecx
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
 ; X86-NEXT:    adcl $0, %edx
-; X86-NEXT:    jae .LBB4_3
+; X86-NEXT:    jae .LBB4_2
 ; X86-NEXT:  # %bb.6:
-; X86-NEXT:    xorl %edi, %edi
 ; X86-NEXT:    xorl %ecx, %ecx
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X86-NEXT:    jmp .LBB4_7
-; X86-NEXT:  .LBB4_3: # %udiv-preheader
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X86-NEXT:    movl %esi, {{[0-9]+}}(%esp)
+; X86-NEXT:    xorl %edx, %edx
+; X86-NEXT:    movl %esi, %ebx
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X86-NEXT:    movl %esi, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X86-NEXT:    movl %esi, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl (%esp), %esi # 4-byte Reload
-; X86-NEXT:    movl %esi, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X86-NEXT:    jmp .LBB4_7
+; X86-NEXT:  .LBB4_1:
+; X86-NEXT:    movl %ebx, %edx
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; X86-NEXT:    jmp .LBB4_9
+; X86-NEXT:  .LBB4_2: # %udiv-preheader
+; X86-NEXT:    movl %edi, %ebx
+; X86-NEXT:    movl %edx, %edi
+; X86-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NEXT:    movl %edx, {{[0-9]+}}(%esp)
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT:    movl %ecx, %eax
+; X86-NEXT:    shrb $3, %al
+; X86-NEXT:    andb $12, %al
+; X86-NEXT:    movzbl %al, %eax
+; X86-NEXT:    movl 108(%esp,%eax), %edx
+; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movb %dl, %ch
-; X86-NEXT:    andb $7, %ch
-; X86-NEXT:    movb %dl, %cl
-; X86-NEXT:    shrb $3, %cl
-; X86-NEXT:    andb $15, %cl
-; X86-NEXT:    movzbl %cl, %edx
-; X86-NEXT:    movl 104(%esp,%edx), %ebx
-; X86-NEXT:    movl 100(%esp,%edx), %edi
-; X86-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl %edi, %ebp
-; X86-NEXT:    movb %ch, %cl
-; X86-NEXT:    shrdl %cl, %ebx, %ebp
-; X86-NEXT:    movl 92(%esp,%edx), %esi
+; X86-NEXT:    movl 104(%esp,%eax), %ebx
+; X86-NEXT:    movl %ebx, %esi
+; X86-NEXT:    shrdl %cl, %edx, %esi
 ; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl 96(%esp,%edx), %esi
-; X86-NEXT:    movl %esi, %edx
-; X86-NEXT:    shrl %cl, %edx
-; X86-NEXT:    notb %cl
-; X86-NEXT:    addl %edi, %edi
-; X86-NEXT:    shll %cl, %edi
-; X86-NEXT:    orl %edx, %edi
+; X86-NEXT:    movl 96(%esp,%eax), %esi
+; X86-NEXT:    movl 100(%esp,%eax), %eax
 ; X86-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movb %ch, %cl
-; X86-NEXT:    shrl %cl, %ebx
-; X86-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    shrdl %cl, %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NEXT:    addl $-1, %ecx
-; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NEXT:    adcl $-1, %ecx
-; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NEXT:    adcl $-1, %ecx
-; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl %eax, %edi
+; X86-NEXT:    shrdl %cl, %ebx, %edi
+; X86-NEXT:    movl %edi, %ebx
+; X86-NEXT:    shrl %cl, %edx
+; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    # kill: def $cl killed $cl killed $ecx
+; X86-NEXT:    shrdl %cl, %eax, %esi
+; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT:    addl $-1, %eax
+; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT:    adcl $-1, %eax
+; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT:    adcl $-1, %eax
+; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
 ; X86-NEXT:    adcl $-1, %ecx
 ; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    xorl %edx, %edx
 ; X86-NEXT:    movl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X86-NEXT:    movl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
 ; X86-NEXT:    .p2align 4, 0x90
-; X86-NEXT:  .LBB4_4: # %udiv-do-while
+; X86-NEXT:  .LBB4_3: # %udiv-do-while
 ; X86-NEXT:    # =>This Inner Loop Header: Depth=1
-; X86-NEXT:    movl %ebp, (%esp) # 4-byte Spill
+; X86-NEXT:    movl %edx, %esi
+; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT:    shldl $1, %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT:    movl %ebx, %edx
+; X86-NEXT:    shldl $1, %ebx, %eax
+; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; X86-NEXT:    shldl $1, %ebp, %ebx
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
-; X86-NEXT:    shldl $1, %ebp, (%esp) # 4-byte Folded Spill
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X86-NEXT:    shldl $1, %edx, %ebp
+; X86-NEXT:    shldl $1, %ebx, %edx
+; X86-NEXT:    shldl $1, %ecx, %ebx
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; X86-NEXT:    shldl $1, %edi, %edx
+; X86-NEXT:    shldl $1, %edi, %ecx
+; X86-NEXT:    orl %esi, %ecx
+; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
 ; X86-NEXT:    shldl $1, %ecx, %edi
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
 ; X86-NEXT:    orl %esi, %edi
 ; X86-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    shldl $1, %eax, %ecx
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X86-NEXT:    shldl $1, %edi, %ecx
 ; X86-NEXT:    orl %esi, %ecx
 ; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    addl %edi, %edi
+; X86-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
+; X86-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    cmpl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Reload
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NEXT:    shldl $1, %ecx, %eax
-; X86-NEXT:    orl %esi, %eax
-; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    addl %ecx, %ecx
-; X86-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
-; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    cmpl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Reload
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NEXT:    sbbl %ebp, %ecx
+; X86-NEXT:    sbbl %edx, %ecx
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NEXT:    sbbl (%esp), %ecx # 4-byte Folded Reload
+; X86-NEXT:    sbbl %eax, %ecx
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NEXT:    sbbl %ebx, %ecx
+; X86-NEXT:    sbbl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
 ; X86-NEXT:    sarl $31, %ecx
-; X86-NEXT:    movl %ecx, %eax
-; X86-NEXT:    andl $1, %eax
-; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    movl %ecx, %esi
-; X86-NEXT:    andl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
+; X86-NEXT:    andl $1, %esi
+; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    movl %ecx, %edi
 ; X86-NEXT:    andl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
+; X86-NEXT:    movl %ecx, %esi
+; X86-NEXT:    andl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
 ; X86-NEXT:    movl %ecx, %eax
 ; X86-NEXT:    andl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
 ; X86-NEXT:    andl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
-; X86-NEXT:    subl %ecx, %edx
-; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    sbbl %eax, %ebp
-; X86-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl (%esp), %ebp # 4-byte Reload
-; X86-NEXT:    sbbl %edi, %ebp
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT:    sbbl %esi, %ebx
+; X86-NEXT:    subl %ecx, %ebx
 ; X86-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    sbbl %eax, %edx
+; X86-NEXT:    movl %edx, %ebx
+; X86-NEXT:    sbbl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT:    sbbl %edi, %eax
+; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
 ; X86-NEXT:    addl $-1, %ecx
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT:    adcl $-1, %eax
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
 ; X86-NEXT:    adcl $-1, %edi
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; X86-NEXT:    adcl $-1, %ebx
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
 ; X86-NEXT:    adcl $-1, %esi
-; X86-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    orl %esi, %edi
+; X86-NEXT:    orl %esi, %eax
 ; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    orl %ebx, %ecx
+; X86-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    orl %edi, %ecx
-; X86-NEXT:    jne .LBB4_4
-; X86-NEXT:  # %bb.5:
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NEXT:    orl %eax, %ecx
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT:    jne .LBB4_3
+; X86-NEXT:  # %bb.4:
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; X86-NEXT:    movl %ecx, %esi
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
 ; X86-NEXT:  .LBB4_7: # %udiv-loop-exit
-; X86-NEXT:    shldl $1, %ebp, %edx
+; X86-NEXT:    shldl $1, %ebx, %esi
+; X86-NEXT:    orl %edx, %esi
+; X86-NEXT:    shldl $1, %eax, %ebx
+; X86-NEXT:    orl %edx, %ebx
+; X86-NEXT:    shldl $1, %edi, %eax
+; X86-NEXT:    orl %edx, %eax
+; X86-NEXT:    movl %edi, %edx
+; X86-NEXT:    addl %edi, %edx
 ; X86-NEXT:    orl %ecx, %edx
-; X86-NEXT:    shldl $1, %eax, %ebp
-; X86-NEXT:    orl %ecx, %ebp
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X86-NEXT:    shldl $1, %esi, %eax
-; X86-NEXT:    orl %ecx, %eax
-; X86-NEXT:    addl %esi, %esi
-; X86-NEXT:    orl %edi, %esi
-; X86-NEXT:  .LBB4_8: # %udiv-end
+; X86-NEXT:  .LBB4_9: # %udiv-end
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NEXT:    xorl %ecx, %edx
-; X86-NEXT:    xorl %ecx, %ebp
-; X86-NEXT:    xorl %ecx, %eax
 ; X86-NEXT:    xorl %ecx, %esi
-; X86-NEXT:    subl %ecx, %esi
-; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    xorl %ecx, %ebx
+; X86-NEXT:    xorl %ecx, %eax
+; X86-NEXT:    xorl %ecx, %edx
+; X86-NEXT:    subl %ecx, %edx
 ; X86-NEXT:    sbbl %ecx, %eax
+; X86-NEXT:    sbbl %ecx, %ebx
+; X86-NEXT:    sbbl %ecx, %esi
+; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl 44(%ebp), %ecx
+; X86-NEXT:    movl %edx, (%ecx)
+; X86-NEXT:    movl %eax, 4(%ecx)
+; X86-NEXT:    movl %ebx, 8(%ecx)
+; X86-NEXT:    movl %esi, 12(%ecx)
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    sbbl %ecx, %ebp
-; X86-NEXT:    sbbl %ecx, %edx
+; X86-NEXT:    movl 28(%ebp), %ecx
+; X86-NEXT:    movl %ebx, %edi
+; X86-NEXT:    movl %edx, %esi
 ; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movl %esi, (%ecx)
-; X86-NEXT:    movl %eax, 4(%ecx)
-; X86-NEXT:    movl %ebp, 8(%ecx)
-; X86-NEXT:    movl %edx, 12(%ecx)
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movl %ebp, %edi
 ; X86-NEXT:    mull %ecx
 ; X86-NEXT:    movl %edx, %ebx
-; X86-NEXT:    movl %eax, %ebp
+; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    movl %esi, %eax
 ; X86-NEXT:    mull %ecx
-; X86-NEXT:    movl %eax, (%esp) # 4-byte Spill
+; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    movl %edx, %ecx
-; X86-NEXT:    addl %ebp, %ecx
+; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
 ; X86-NEXT:    adcl $0, %ebx
 ; X86-NEXT:    movl %esi, %eax
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebp
-; X86-NEXT:    mull %ebp
+; X86-NEXT:    movl 32(%ebp), %esi
+; X86-NEXT:    mull %esi
 ; X86-NEXT:    addl %ecx, %eax
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    adcl %ebx, %edx
-; X86-NEXT:    movl %edx, %ebx
-; X86-NEXT:    setb %cl
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X86-NEXT:    movl %esi, %eax
-; X86-NEXT:    mull %ebp
-; X86-NEXT:    addl %ebx, %eax
-; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movzbl %cl, %eax
+; X86-NEXT:    movl %edx, %ecx
+; X86-NEXT:    setb %bl
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT:    mull %esi
+; X86-NEXT:    addl %ecx, %eax
+; X86-NEXT:    movl %eax, %ecx
+; X86-NEXT:    movzbl %bl, %eax
 ; X86-NEXT:    adcl %eax, %edx
 ; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    imull %eax, %ecx
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; X86-NEXT:    movl 28(%ebp), %eax
+; X86-NEXT:    imull %eax, %ebx
 ; X86-NEXT:    mull %edi
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    imull %ebp, %edi
+; X86-NEXT:    imull %esi, %edi
 ; X86-NEXT:    addl %edx, %edi
-; X86-NEXT:    addl %ecx, %edi
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    movl %eax, %ecx
-; X86-NEXT:    imull %esi, %ecx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
+; X86-NEXT:    addl %ebx, %edi
+; X86-NEXT:    movl 36(%ebp), %eax
+; X86-NEXT:    movl %eax, %esi
+; X86-NEXT:    imull {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
+; X86-NEXT:    movl 40(%ebp), %ebx
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X86-NEXT:    imull %edx, %esi
+; X86-NEXT:    imull %edx, %ebx
 ; X86-NEXT:    mull %edx
-; X86-NEXT:    addl %edx, %esi
-; X86-NEXT:    addl %ecx, %esi
+; X86-NEXT:    addl %edx, %ebx
+; X86-NEXT:    addl %esi, %ebx
 ; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
-; X86-NEXT:    adcl %edi, %esi
-; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
-; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    subl (%esp), %ecx # 4-byte Folded Reload
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-NEXT:    sbbl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebx
-; X86-NEXT:    sbbl %eax, %ebx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
-; X86-NEXT:    sbbl %esi, %edi
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    movl %ecx, (%eax)
-; X86-NEXT:    movl %edx, 4(%eax)
-; X86-NEXT:    movl %ebx, 8(%eax)
-; X86-NEXT:    movl %edi, 12(%eax)
-; X86-NEXT:    addl $156, %esp
+; X86-NEXT:    adcl %edi, %ebx
+; X86-NEXT:    addl %ecx, %eax
+; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
+; X86-NEXT:    movl 12(%ebp), %edx
+; X86-NEXT:    subl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
+; X86-NEXT:    movl 16(%ebp), %ecx
+; X86-NEXT:    sbbl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
+; X86-NEXT:    movl 20(%ebp), %edi
+; X86-NEXT:    sbbl %eax, %edi
+; X86-NEXT:    movl 24(%ebp), %esi
+; X86-NEXT:    sbbl %ebx, %esi
+; X86-NEXT:    movl 8(%ebp), %eax
+; X86-NEXT:    movl %edx, (%eax)
+; X86-NEXT:    movl %ecx, 4(%eax)
+; X86-NEXT:    movl %edi, 8(%eax)
+; X86-NEXT:    movl %esi, 12(%eax)
+; X86-NEXT:    leal -12(%ebp), %esp
 ; X86-NEXT:    popl %esi
 ; X86-NEXT:    popl %edi
 ; X86-NEXT:    popl %ebx
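
In both X86 scalar_i128 diffs (this signed variant and the unsigned one
below), the same change shows up three ways: the function now keeps a
frame pointer and realigns the stack (movl %esp, %ebp / andl $-16, %esp)
so the padding words can be stored with one aligned movaps; the byte
offset into the spill slot is masked to 32-bit word granularity
(andb $12, %al instead of andb $15, %al); and the notb %cl plus extra
shrl/shll fixups after each load disappear because the loads are now
word-aligned and shldl/shrdl pairs suffice. A small C++ illustration of
the offset selection, with invented names and assuming Amt < 128:

    // Illustrative only: byte offset into the 16-byte spill slot for an
    // i128 shift by Amt bits.
    unsigned spillOffset(unsigned Amt, bool WordGranular) {
      unsigned ByteOff = Amt >> 3;          // shrb $3, %al
      return WordGranular ? (ByteOff & 12)  // new: andb $12 (aligned words)
                          : (ByteOff & 15); // old: andb $15 (any byte)
    }

The unsigned variant below shows the identical pattern.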

diff  --git a/llvm/test/CodeGen/X86/div-rem-pair-recomposition-unsigned.ll b/llvm/test/CodeGen/X86/div-rem-pair-recomposition-unsigned.ll
index 58ea70e58028f1..16dc1d6b446cf7 100644
--- a/llvm/test/CodeGen/X86/div-rem-pair-recomposition-unsigned.ll
+++ b/llvm/test/CodeGen/X86/div-rem-pair-recomposition-unsigned.ll
@@ -174,379 +174,370 @@ define i128 @scalar_i128(i128 %x, i128 %y, ptr %divdst) nounwind {
 ; X86-LABEL: scalar_i128:
 ; X86:       # %bb.0: # %_udiv-special-cases
 ; X86-NEXT:    pushl %ebp
+; X86-NEXT:    movl %esp, %ebp
 ; X86-NEXT:    pushl %ebx
 ; X86-NEXT:    pushl %edi
 ; X86-NEXT:    pushl %esi
-; X86-NEXT:    subl $136, %esp
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebp
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    orl %edi, %eax
-; X86-NEXT:    movl %ebp, %ecx
-; X86-NEXT:    orl %esi, %ecx
+; X86-NEXT:    andl $-16, %esp
+; X86-NEXT:    subl $160, %esp
+; X86-NEXT:    movl 28(%ebp), %ebx
+; X86-NEXT:    movl 40(%ebp), %esi
+; X86-NEXT:    movl 32(%ebp), %edi
+; X86-NEXT:    movl %edi, %eax
+; X86-NEXT:    orl %esi, %eax
+; X86-NEXT:    movl %ebx, %ecx
+; X86-NEXT:    orl 36(%ebp), %ecx
 ; X86-NEXT:    orl %eax, %ecx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    sete %bl
-; X86-NEXT:    orl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-NEXT:    orl {{[0-9]+}}(%esp), %edx
+; X86-NEXT:    sete %cl
+; X86-NEXT:    movl 16(%ebp), %eax
+; X86-NEXT:    orl 24(%ebp), %eax
+; X86-NEXT:    movl 12(%ebp), %edx
+; X86-NEXT:    orl 20(%ebp), %edx
 ; X86-NEXT:    orl %eax, %edx
 ; X86-NEXT:    sete %al
-; X86-NEXT:    orb %bl, %al
-; X86-NEXT:    movb %al, (%esp) # 1-byte Spill
-; X86-NEXT:    bsrl %edi, %edx
+; X86-NEXT:    orb %cl, %al
+; X86-NEXT:    movb %al, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill
+; X86-NEXT:    bsrl %esi, %edx
 ; X86-NEXT:    xorl $31, %edx
-; X86-NEXT:    bsrl %esi, %ecx
+; X86-NEXT:    bsrl 36(%ebp), %ecx
 ; X86-NEXT:    xorl $31, %ecx
 ; X86-NEXT:    orl $32, %ecx
-; X86-NEXT:    testl %edi, %edi
-; X86-NEXT:    movl %edi, %ebx
+; X86-NEXT:    testl %esi, %esi
 ; X86-NEXT:    cmovnel %edx, %ecx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    bsrl %eax, %edx
+; X86-NEXT:    bsrl %edi, %edx
 ; X86-NEXT:    xorl $31, %edx
-; X86-NEXT:    bsrl %ebp, %ebp
-; X86-NEXT:    movl %esi, %edi
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; X86-NEXT:    xorl $31, %ebp
-; X86-NEXT:    orl $32, %ebp
-; X86-NEXT:    testl %eax, %eax
-; X86-NEXT:    cmovnel %edx, %ebp
-; X86-NEXT:    orl $64, %ebp
-; X86-NEXT:    movl %edi, %edx
-; X86-NEXT:    orl %ebx, %edx
-; X86-NEXT:    cmovnel %ecx, %ebp
-; X86-NEXT:    bsrl %esi, %edx
-; X86-NEXT:    movl %esi, %ebx
+; X86-NEXT:    bsrl %ebx, %eax
+; X86-NEXT:    xorl $31, %eax
+; X86-NEXT:    orl $32, %eax
+; X86-NEXT:    testl %edi, %edi
+; X86-NEXT:    cmovnel %edx, %eax
+; X86-NEXT:    orl $64, %eax
+; X86-NEXT:    movl 36(%ebp), %edx
+; X86-NEXT:    orl %esi, %edx
+; X86-NEXT:    cmovnel %ecx, %eax
+; X86-NEXT:    movl 24(%ebp), %ebx
+; X86-NEXT:    bsrl %ebx, %edx
 ; X86-NEXT:    xorl $31, %edx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    bsrl %eax, %ecx
+; X86-NEXT:    movl 20(%ebp), %ecx
+; X86-NEXT:    bsrl %ecx, %ecx
 ; X86-NEXT:    xorl $31, %ecx
 ; X86-NEXT:    orl $32, %ecx
-; X86-NEXT:    testl %esi, %esi
+; X86-NEXT:    testl %ebx, %ebx
 ; X86-NEXT:    cmovnel %edx, %ecx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
+; X86-NEXT:    movl 16(%ebp), %edi
 ; X86-NEXT:    bsrl %edi, %esi
 ; X86-NEXT:    xorl $31, %esi
-; X86-NEXT:    bsrl {{[0-9]+}}(%esp), %edx
+; X86-NEXT:    bsrl 12(%ebp), %edx
 ; X86-NEXT:    xorl $31, %edx
 ; X86-NEXT:    orl $32, %edx
 ; X86-NEXT:    testl %edi, %edi
 ; X86-NEXT:    cmovnel %esi, %edx
 ; X86-NEXT:    orl $64, %edx
-; X86-NEXT:    orl %ebx, %eax
+; X86-NEXT:    movl 20(%ebp), %edi
+; X86-NEXT:    movl %edi, %esi
+; X86-NEXT:    orl %ebx, %esi
 ; X86-NEXT:    cmovnel %ecx, %edx
-; X86-NEXT:    subl %edx, %ebp
+; X86-NEXT:    subl %edx, %eax
 ; X86-NEXT:    movl $0, %edx
 ; X86-NEXT:    sbbl %edx, %edx
+; X86-NEXT:    movl $0, %ebx
+; X86-NEXT:    sbbl %ebx, %ebx
 ; X86-NEXT:    movl $0, %esi
 ; X86-NEXT:    sbbl %esi, %esi
-; X86-NEXT:    movl $0, %edi
-; X86-NEXT:    sbbl %edi, %edi
 ; X86-NEXT:    movl $127, %ecx
-; X86-NEXT:    cmpl %ebp, %ecx
+; X86-NEXT:    cmpl %eax, %ecx
 ; X86-NEXT:    movl $0, %ecx
 ; X86-NEXT:    sbbl %edx, %ecx
 ; X86-NEXT:    movl $0, %ecx
-; X86-NEXT:    sbbl %esi, %ecx
+; X86-NEXT:    sbbl %ebx, %ecx
 ; X86-NEXT:    movl $0, %ecx
-; X86-NEXT:    sbbl %edi, %ecx
+; X86-NEXT:    sbbl %esi, %ecx
 ; X86-NEXT:    setb %cl
-; X86-NEXT:    orb (%esp), %cl # 1-byte Folded Reload
-; X86-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl %ebp, %eax
+; X86-NEXT:    orb {{[-0-9]+}}(%e{{[sb]}}p), %cl # 1-byte Folded Reload
+; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    xorl $127, %eax
-; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    orl %esi, %eax
+; X86-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    orl %ebx, %eax
 ; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    orl %edi, %edx
+; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    orl %esi, %edx
 ; X86-NEXT:    orl %eax, %edx
 ; X86-NEXT:    sete %al
 ; X86-NEXT:    testb %cl, %cl
-; X86-NEXT:    movl %ebx, %edx
-; X86-NEXT:    movl $0, %edi
-; X86-NEXT:    cmovnel %edi, %edx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; X86-NEXT:    cmovnel %edi, %esi
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebp
-; X86-NEXT:    cmovnel %edi, %ebp
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebx
-; X86-NEXT:    cmovnel %edi, %ebx
-; X86-NEXT:    orb %cl, %al
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movb %cl, %ah
+; X86-NEXT:    movl 24(%ebp), %ebx
+; X86-NEXT:    movl $0, %esi
+; X86-NEXT:    cmovnel %esi, %ebx
+; X86-NEXT:    movl %edi, %ecx
+; X86-NEXT:    cmovnel %esi, %ecx
+; X86-NEXT:    movl $0, %edx
+; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl 16(%ebp), %esi
+; X86-NEXT:    cmovnel %edx, %esi
+; X86-NEXT:    movl 12(%ebp), %edi
+; X86-NEXT:    movl %edi, %ecx
+; X86-NEXT:    cmovnel %edx, %ecx
+; X86-NEXT:    orb %ah, %al
+; X86-NEXT:    movl 44(%ebp), %eax
 ; X86-NEXT:    jne .LBB4_7
 ; X86-NEXT:  # %bb.1: # %udiv-bb1
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl %edi, {{[0-9]+}}(%esp)
+; X86-NEXT:    xorps %xmm0, %xmm0
+; X86-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl 16(%ebp), %eax
 ; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl 20(%ebp), %edx
+; X86-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl 24(%ebp), %eax
 ; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; X86-NEXT:    movl %ebx, %eax
-; X86-NEXT:    xorb $127, %al
-; X86-NEXT:    movb %al, %ch
-; X86-NEXT:    andb $7, %ch
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NEXT:    movl %edx, %ecx
+; X86-NEXT:    xorb $127, %cl
+; X86-NEXT:    movl %ecx, %eax
 ; X86-NEXT:    shrb $3, %al
-; X86-NEXT:    andb $15, %al
+; X86-NEXT:    andb $12, %al
 ; X86-NEXT:    negb %al
 ; X86-NEXT:    movsbl %al, %eax
-; X86-NEXT:    movl 128(%esp,%eax), %edx
-; X86-NEXT:    movl 132(%esp,%eax), %esi
-; X86-NEXT:    movb %ch, %cl
-; X86-NEXT:    shldl %cl, %edx, %esi
+; X86-NEXT:    movl 136(%esp,%eax), %edi
+; X86-NEXT:    movl 140(%esp,%eax), %esi
+; X86-NEXT:    shldl %cl, %edi, %esi
 ; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    shll %cl, %edx
-; X86-NEXT:    notb %cl
-; X86-NEXT:    movl 124(%esp,%eax), %ebp
-; X86-NEXT:    movl %ebp, %esi
-; X86-NEXT:    shrl %esi
-; X86-NEXT:    shrl %cl, %esi
-; X86-NEXT:    orl %edx, %esi
-; X86-NEXT:    movl 120(%esp,%eax), %eax
-; X86-NEXT:    movb %ch, %cl
-; X86-NEXT:    shldl %cl, %eax, %ebp
-; X86-NEXT:    shll %cl, %eax
-; X86-NEXT:    movl %eax, (%esp) # 4-byte Spill
-; X86-NEXT:    addl $1, %ebx
-; X86-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NEXT:    adcl $0, %ecx
+; X86-NEXT:    movl 128(%esp,%eax), %ebx
+; X86-NEXT:    movl 132(%esp,%eax), %eax
+; X86-NEXT:    shldl %cl, %eax, %edi
+; X86-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl %eax, %edi
+; X86-NEXT:    shldl %cl, %ebx, %edi
+; X86-NEXT:    shll %cl, %ebx
+; X86-NEXT:    movl %ebx, %ecx
+; X86-NEXT:    addl $1, %edx
+; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
 ; X86-NEXT:    adcl $0, %eax
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; X86-NEXT:    adcl $0, %ebx
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X86-NEXT:    adcl $0, %esi
+; X86-NEXT:    adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT:    movl 20(%ebp), %ebx
 ; X86-NEXT:    jae .LBB4_2
 ; X86-NEXT:  # %bb.5:
+; X86-NEXT:    xorl %edx, %edx
 ; X86-NEXT:    xorl %eax, %eax
-; X86-NEXT:    xorl %ecx, %ecx
+; X86-NEXT:    movl %edi, %esi
 ; X86-NEXT:    jmp .LBB4_6
 ; X86-NEXT:  .LBB4_2: # %udiv-preheader
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-NEXT:    movl %edx, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl 12(%ebp), %edx
 ; X86-NEXT:    movl %edx, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-NEXT:    movl 16(%ebp), %edx
 ; X86-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl %ebx, {{[0-9]+}}(%esp)
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl 24(%ebp), %eax
 ; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT:    movb %al, %ch
-; X86-NEXT:    andb $7, %ch
 ; X86-NEXT:    # kill: def $al killed $al killed $eax
 ; X86-NEXT:    shrb $3, %al
-; X86-NEXT:    andb $15, %al
+; X86-NEXT:    andb $12, %al
 ; X86-NEXT:    movzbl %al, %eax
-; X86-NEXT:    movl 84(%esp,%eax), %ebx
+; X86-NEXT:    movl 92(%esp,%eax), %esi
 ; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl 80(%esp,%eax), %esi
-; X86-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl %esi, %edx
-; X86-NEXT:    movb %ch, %cl
-; X86-NEXT:    shrdl %cl, %ebx, %edx
-; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl 72(%esp,%eax), %ebp
-; X86-NEXT:    movl 76(%esp,%eax), %edx
-; X86-NEXT:    movl %edx, %eax
-; X86-NEXT:    shrl %cl, %eax
-; X86-NEXT:    notb %cl
-; X86-NEXT:    addl %esi, %esi
-; X86-NEXT:    shll %cl, %esi
-; X86-NEXT:    orl %eax, %esi
+; X86-NEXT:    movl 88(%esp,%eax), %edx
+; X86-NEXT:    movl %edx, %ebx
+; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT:    # kill: def $cl killed $cl killed $ecx
+; X86-NEXT:    shrdl %cl, %esi, %ebx
+; X86-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl 80(%esp,%eax), %edi
+; X86-NEXT:    movl 84(%esp,%eax), %eax
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; X86-NEXT:    movl %eax, %esi
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT:    shrdl %cl, %edx, %esi
 ; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movb %ch, %cl
-; X86-NEXT:    shrl %cl, %ebx
-; X86-NEXT:    movl %ebx, %edi
-; X86-NEXT:    shrdl %cl, %edx, %ebp
-; X86-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NEXT:    shrl %cl, %edx
+; X86-NEXT:    # kill: def $cl killed $cl killed $ecx
+; X86-NEXT:    shrdl %cl, %eax, %edi
+; X86-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl 28(%ebp), %eax
 ; X86-NEXT:    addl $-1, %eax
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl 32(%ebp), %eax
 ; X86-NEXT:    adcl $-1, %eax
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    adcl $-1, %ecx
-; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-NEXT:    adcl $-1, %edx
-; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    xorl %ecx, %ecx
+; X86-NEXT:    movl 36(%ebp), %esi
+; X86-NEXT:    adcl $-1, %esi
+; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl 40(%ebp), %eax
+; X86-NEXT:    adcl $-1, %eax
+; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    xorl %eax, %eax
 ; X86-NEXT:    movl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X86-NEXT:    movl (%esp), %edx # 4-byte Reload
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
 ; X86-NEXT:    .p2align 4, 0x90
 ; X86-NEXT:  .LBB4_3: # %udiv-do-while
 ; X86-NEXT:    # =>This Inner Loop Header: Depth=1
-; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
-; X86-NEXT:    shldl $1, %ebp, %edi
-; X86-NEXT:    movl %edi, (%esp) # 4-byte Spill
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; X86-NEXT:    shldl $1, %ebx, %ebp
+; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X86-NEXT:    shldl $1, %esi, %ebx
+; X86-NEXT:    shldl $1, %esi, %edx
+; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
 ; X86-NEXT:    shldl $1, %edi, %esi
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT:    shldl $1, %eax, %edi
-; X86-NEXT:    orl %ecx, %edi
-; X86-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; X86-NEXT:    shldl $1, %edi, %eax
-; X86-NEXT:    orl %ecx, %eax
-; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    shldl $1, %edx, %edi
-; X86-NEXT:    orl %ecx, %edi
-; X86-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    addl %edx, %edx
-; X86-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
-; X86-NEXT:    cmpl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Reload
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; X86-NEXT:    shldl $1, %ebx, %edi
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NEXT:    shldl $1, %edx, %ebx
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NEXT:    sbbl %ebx, %ecx
+; X86-NEXT:    shldl $1, %ecx, %edx
+; X86-NEXT:    orl %eax, %edx
+; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NEXT:    shldl $1, %edx, %ecx
+; X86-NEXT:    orl %eax, %ecx
+; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT:    shldl $1, %ecx, %edx
+; X86-NEXT:    orl %eax, %edx
+; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    addl %ecx, %ecx
+; X86-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
+; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    cmpl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Reload
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT:    sbbl %edi, %ecx
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NEXT:    sbbl %ebp, %ecx
+; X86-NEXT:    sbbl %esi, %ecx
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NEXT:    sbbl (%esp), %ecx # 4-byte Folded Reload
+; X86-NEXT:    sbbl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
 ; X86-NEXT:    sarl $31, %ecx
 ; X86-NEXT:    movl %ecx, %eax
 ; X86-NEXT:    andl $1, %eax
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    movl %ecx, %eax
-; X86-NEXT:    andl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    andl 40(%ebp), %eax
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl %ecx, %edi
-; X86-NEXT:    andl {{[0-9]+}}(%esp), %edi
 ; X86-NEXT:    movl %ecx, %eax
-; X86-NEXT:    andl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    andl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    subl %ecx, %esi
-; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    sbbl %eax, %ebx
+; X86-NEXT:    andl 36(%ebp), %eax
+; X86-NEXT:    movl %ecx, %edx
+; X86-NEXT:    andl 32(%ebp), %edx
+; X86-NEXT:    andl 28(%ebp), %ecx
+; X86-NEXT:    subl %ecx, %ebx
 ; X86-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    sbbl %edi, %ebp
-; X86-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT:    sbbl %eax, (%esp) # 4-byte Folded Spill
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; X86-NEXT:    sbbl %edx, %edi
+; X86-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    sbbl %eax, %esi
+; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NEXT:    sbbl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
 ; X86-NEXT:    addl $-1, %ecx
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
 ; X86-NEXT:    adcl $-1, %eax
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
 ; X86-NEXT:    adcl $-1, %ebx
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; X86-NEXT:    adcl $-1, %edi
+; X86-NEXT:    adcl $-1, %esi
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    orl %edi, %eax
+; X86-NEXT:    orl %esi, %eax
 ; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    orl %ebx, %ecx
-; X86-NEXT:    movl (%esp), %edi # 4-byte Reload
 ; X86-NEXT:    orl %eax, %ecx
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
 ; X86-NEXT:    jne .LBB4_3
 ; X86-NEXT:  # %bb.4:
-; X86-NEXT:    movl %edx, (%esp) # 4-byte Spill
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT:  .LBB4_6: # %udiv-loop-exit
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X86-NEXT:    shldl $1, %esi, %edx
-; X86-NEXT:    orl %ecx, %edx
-; X86-NEXT:    shldl $1, %ebp, %esi
-; X86-NEXT:    orl %ecx, %esi
-; X86-NEXT:    movl (%esp), %ebx # 4-byte Reload
-; X86-NEXT:    shldl $1, %ebx, %ebp
-; X86-NEXT:    orl %ecx, %ebp
-; X86-NEXT:    addl %ebx, %ebx
+; X86-NEXT:  .LBB4_6: # %udiv-loop-exit
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X86-NEXT:    shldl $1, %edi, %ebx
 ; X86-NEXT:    orl %eax, %ebx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    shldl $1, %esi, %edi
+; X86-NEXT:    orl %eax, %edi
+; X86-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    shldl $1, %ecx, %esi
+; X86-NEXT:    orl %eax, %esi
+; X86-NEXT:    addl %ecx, %ecx
+; X86-NEXT:    orl %edx, %ecx
+; X86-NEXT:    movl 44(%ebp), %eax
 ; X86-NEXT:  .LBB4_7: # %udiv-end
-; X86-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl %ebx, (%eax)
-; X86-NEXT:    movl %ebp, 4(%eax)
-; X86-NEXT:    movl %esi, 8(%eax)
-; X86-NEXT:    movl %edx, 12(%eax)
-; X86-NEXT:    movl %ebx, %ecx
-; X86-NEXT:    movl %ebx, (%esp) # 4-byte Spill
-; X86-NEXT:    movl %esi, %ebx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl %ecx, (%eax)
+; X86-NEXT:    movl %esi, 4(%eax)
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NEXT:    movl %edx, 8(%eax)
+; X86-NEXT:    movl %ebx, 12(%eax)
+; X86-NEXT:    movl %esi, %edx
+; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl 36(%ebp), %eax
 ; X86-NEXT:    movl %eax, %esi
-; X86-NEXT:    imull %ebp, %esi
-; X86-NEXT:    movl %edx, %edi
+; X86-NEXT:    imull %edx, %esi
 ; X86-NEXT:    mull %ecx
+; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    addl %esi, %edx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebp
-; X86-NEXT:    imull %ecx, %ebp
-; X86-NEXT:    addl %edx, %ebp
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; X86-NEXT:    movl %esi, %eax
-; X86-NEXT:    mull %ebx
-; X86-NEXT:    movl %eax, %ecx
-; X86-NEXT:    imull %esi, %edi
+; X86-NEXT:    movl 40(%ebp), %edi
+; X86-NEXT:    imull %ecx, %edi
 ; X86-NEXT:    addl %edx, %edi
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    imull %eax, %ebx
-; X86-NEXT:    addl %edi, %ebx
+; X86-NEXT:    movl 28(%ebp), %eax
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X86-NEXT:    mull %esi
+; X86-NEXT:    movl %eax, %ecx
+; X86-NEXT:    imull 28(%ebp), %ebx
+; X86-NEXT:    addl %edx, %ebx
+; X86-NEXT:    movl 32(%ebp), %edx
+; X86-NEXT:    imull %edx, %esi
+; X86-NEXT:    addl %ebx, %esi
 ; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
 ; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    adcl %ebp, %ebx
-; X86-NEXT:    movl (%esp), %ebp # 4-byte Reload
-; X86-NEXT:    movl %ebp, %eax
-; X86-NEXT:    mull %esi
-; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    adcl %edi, %esi
+; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X86-NEXT:    movl %edi, %eax
+; X86-NEXT:    movl 28(%ebp), %ecx
+; X86-NEXT:    mull %ecx
+; X86-NEXT:    movl %edx, %esi
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT:    mull %esi
-; X86-NEXT:    movl %edx, %edi
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; X86-NEXT:    movl %ebx, %eax
+; X86-NEXT:    mull %ecx
 ; X86-NEXT:    movl %eax, %ecx
-; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
-; X86-NEXT:    adcl $0, %edi
-; X86-NEXT:    movl %ebp, %eax
-; X86-NEXT:    mull {{[0-9]+}}(%esp)
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; X86-NEXT:    movl %edx, %ebp
+; X86-NEXT:    addl %esi, %ecx
+; X86-NEXT:    adcl $0, %edx
+; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl %edi, %eax
+; X86-NEXT:    mull 32(%ebp)
+; X86-NEXT:    movl 16(%ebp), %esi
+; X86-NEXT:    movl %edx, %edi
 ; X86-NEXT:    addl %ecx, %eax
-; X86-NEXT:    movl %eax, (%esp) # 4-byte Spill
-; X86-NEXT:    adcl %edi, %ebp
+; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
 ; X86-NEXT:    setb %cl
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT:    mull {{[0-9]+}}(%esp)
-; X86-NEXT:    addl %ebp, %eax
+; X86-NEXT:    movl %ebx, %eax
+; X86-NEXT:    mull 32(%ebp)
+; X86-NEXT:    addl %edi, %eax
 ; X86-NEXT:    movzbl %cl, %ecx
 ; X86-NEXT:    adcl %ecx, %edx
 ; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
-; X86-NEXT:    adcl %ebx, %edx
-; X86-NEXT:    subl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
-; X86-NEXT:    sbbl (%esp), %edi # 4-byte Folded Reload
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebx
-; X86-NEXT:    sbbl %eax, %ebx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
+; X86-NEXT:    movl 12(%ebp), %ebx
+; X86-NEXT:    subl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
+; X86-NEXT:    sbbl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
+; X86-NEXT:    movl 20(%ebp), %edi
+; X86-NEXT:    sbbl %eax, %edi
+; X86-NEXT:    movl 24(%ebp), %ecx
 ; X86-NEXT:    sbbl %edx, %ecx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    movl %esi, (%eax)
-; X86-NEXT:    movl %edi, 4(%eax)
-; X86-NEXT:    movl %ebx, 8(%eax)
+; X86-NEXT:    movl 8(%ebp), %eax
+; X86-NEXT:    movl %ebx, (%eax)
+; X86-NEXT:    movl %esi, 4(%eax)
+; X86-NEXT:    movl %edi, 8(%eax)
 ; X86-NEXT:    movl %ecx, 12(%eax)
-; X86-NEXT:    addl $136, %esp
+; X86-NEXT:    leal -12(%ebp), %esp
 ; X86-NEXT:    popl %esi
 ; X86-NEXT:    popl %edi
 ; X86-NEXT:    popl %ebx

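The hunks above come from the i128 divide-plus-remainder tests: the expansion emits a single division loop (the udiv-do-while / udiv-loop-exit blocks) and recomposes the remainder as x - (x/y)*y, which is the multiply-and-subtract tail before the four stores. A minimal sketch of the kind of input that produces this code, assuming the test bodies match the pattern the block labels suggest:

    define void @divrem_i128(i128 %x, i128 %y, ptr %pq, ptr %pr) nounwind {
    entry:
      ; One division loop is shared by both results; the urem is
      ; recomposed as %x - (%q * %y) after the loop.
      %q = udiv i128 %x, %y
      %r = urem i128 %x, %y
      store i128 %q, ptr %pq
      store i128 %r, ptr %pr
      ret void
    }
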
diff  --git a/llvm/test/CodeGen/X86/pr38539.ll b/llvm/test/CodeGen/X86/pr38539.ll
index 6fcebdb5116ddd..fb169a3777fb82 100644
--- a/llvm/test/CodeGen/X86/pr38539.ll
+++ b/llvm/test/CodeGen/X86/pr38539.ll
@@ -22,7 +22,7 @@ define void @f() nounwind {
 ; X86-NEXT:    pushl %edi
 ; X86-NEXT:    pushl %esi
 ; X86-NEXT:    andl $-16, %esp
-; X86-NEXT:    subl $176, %esp
+; X86-NEXT:    subl $160, %esp
 ; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %edi
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
@@ -96,18 +96,16 @@ define void @f() nounwind {
 ; X86-NEXT:    addl $1, %ecx
 ; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    adcl $0, %ebx
-; X86-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    adcl $0, %edx
 ; X86-NEXT:    andl $3, %edx
 ; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    movb $65, %cl
 ; X86-NEXT:    subb %al, %cl
-; X86-NEXT:    movb %cl, %ch
-; X86-NEXT:    andb $7, %ch
-; X86-NEXT:    shrb $3, %cl
-; X86-NEXT:    andb $15, %cl
-; X86-NEXT:    negb %cl
-; X86-NEXT:    movsbl %cl, %esi
+; X86-NEXT:    movl %ecx, %eax
+; X86-NEXT:    shrb $3, %al
+; X86-NEXT:    andb $12, %al
+; X86-NEXT:    negb %al
+; X86-NEXT:    movsbl %al, %esi
 ; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
 ; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
 ; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
@@ -117,29 +115,24 @@ define void @f() nounwind {
 ; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
 ; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
 ; X86-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl 136(%esp,%esi), %edi
-; X86-NEXT:    movb %ch, %cl
-; X86-NEXT:    shll %cl, %edi
-; X86-NEXT:    notb %cl
-; X86-NEXT:    movl 128(%esp,%esi), %ebx
-; X86-NEXT:    movl 132(%esp,%esi), %eax
-; X86-NEXT:    movl %eax, %esi
-; X86-NEXT:    shrl %esi
-; X86-NEXT:    shrl %cl, %esi
-; X86-NEXT:    movb %ch, %cl
-; X86-NEXT:    shldl %cl, %ebx, %eax
+; X86-NEXT:    movl 112(%esp,%esi), %edi
+; X86-NEXT:    movl 116(%esp,%esi), %eax
+; X86-NEXT:    movl 120(%esp,%esi), %esi
+; X86-NEXT:    shldl %cl, %eax, %esi
+; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    shldl %cl, %edi, %eax
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    shll %cl, %ebx
+; X86-NEXT:    shll %cl, %edi
+; X86-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT:    orl %edx, %eax
 ; X86-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NEXT:    orl %edx, %ecx
-; X86-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
+; X86-NEXT:    orl %ebx, %eax
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
 ; X86-NEXT:    je .LBB0_13
 ; X86-NEXT:  # %bb.11: # %udiv-preheader
-; X86-NEXT:    andl $3, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X86-NEXT:    orl %esi, %edi
 ; X86-NEXT:    andl $3, %edi
-; X86-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    andl $3, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
 ; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
 ; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
 ; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
@@ -148,26 +141,20 @@ define void @f() nounwind {
 ; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
 ; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
 ; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT:    movb %al, %ch
-; X86-NEXT:    andb $7, %ch
-; X86-NEXT:    # kill: def $al killed $al killed $eax
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT:    movl %ecx, %eax
 ; X86-NEXT:    shrb $3, %al
-; X86-NEXT:    andb $15, %al
-; X86-NEXT:    movzbl %al, %edx
-; X86-NEXT:    movl 80(%esp,%edx), %edi
-; X86-NEXT:    movl 84(%esp,%edx), %eax
-; X86-NEXT:    movl %eax, %esi
-; X86-NEXT:    movb %ch, %cl
-; X86-NEXT:    shrl %cl, %esi
-; X86-NEXT:    notb %cl
-; X86-NEXT:    movl 88(%esp,%edx), %ebx
-; X86-NEXT:    addl %ebx, %ebx
-; X86-NEXT:    shll %cl, %ebx
-; X86-NEXT:    orl %esi, %ebx
-; X86-NEXT:    movb %ch, %cl
-; X86-NEXT:    shrdl %cl, %eax, %edi
-; X86-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    andb $12, %al
+; X86-NEXT:    movzbl %al, %eax
+; X86-NEXT:    movl 72(%esp,%eax), %ebx
+; X86-NEXT:    movl 64(%esp,%eax), %esi
+; X86-NEXT:    movl 68(%esp,%eax), %edx
+; X86-NEXT:    movl %edx, %eax
+; X86-NEXT:    shrdl %cl, %ebx, %eax
+; X86-NEXT:    movl %eax, %ebx
+; X86-NEXT:    # kill: def $cl killed $cl killed $ecx
+; X86-NEXT:    shrdl %cl, %edx, %esi
+; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    movl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
 ; X86-NEXT:    addl $-1, %eax
@@ -175,70 +162,69 @@ define void @f() nounwind {
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
 ; X86-NEXT:    adcl $-1, %eax
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT:    adcl $3, %eax
-; X86-NEXT:    andl $3, %eax
-; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    adcl $3, %edi
+; X86-NEXT:    andl $3, %edi
+; X86-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    movl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
 ; X86-NEXT:    xorl %ecx, %ecx
 ; X86-NEXT:    .p2align 4, 0x90
 ; X86-NEXT:  .LBB0_12: # %udiv-do-while
 ; X86-NEXT:    # =>This Inner Loop Header: Depth=1
+; X86-NEXT:    movl %ebx, %esi
 ; X86-NEXT:    shldl $1, %ebx, %ecx
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; X86-NEXT:    shldl $1, %ebx, %esi
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; X86-NEXT:    shldl $1, %edi, %ebx
-; X86-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X86-NEXT:    movl %esi, %eax
-; X86-NEXT:    andl $2, %eax
-; X86-NEXT:    shrl %eax
-; X86-NEXT:    leal (%eax,%edi,2), %edi
+; X86-NEXT:    movl %edi, %edx
+; X86-NEXT:    andl $2, %edx
+; X86-NEXT:    shrl %edx
+; X86-NEXT:    leal (%edx,%ebx,2), %ebx
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X86-NEXT:    shldl $1, %edx, %esi
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; X86-NEXT:    orl %ebx, %esi
+; X86-NEXT:    shldl $1, %edx, %edi
+; X86-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
 ; X86-NEXT:    shldl $1, %eax, %edx
-; X86-NEXT:    orl %ebx, %edx
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; X86-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
 ; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    addl %eax, %eax
 ; X86-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    andl $3, %esi
-; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    cmpl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Reload
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT:    sbbl %ebx, %eax
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X86-NEXT:    sbbl %ecx, %esi
-; X86-NEXT:    shll $30, %esi
-; X86-NEXT:    movl %esi, %eax
-; X86-NEXT:    sarl $30, %eax
-; X86-NEXT:    sarl $31, %esi
-; X86-NEXT:    shrdl $1, %esi, %eax
-; X86-NEXT:    movl %eax, %edx
-; X86-NEXT:    andl $1, %edx
-; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    andl $3, %edi
+; X86-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    cmpl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Reload
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NEXT:    sbbl %esi, %edx
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NEXT:    sbbl %ecx, %edx
+; X86-NEXT:    shll $30, %edx
+; X86-NEXT:    movl %edx, %edi
+; X86-NEXT:    sarl $30, %edi
+; X86-NEXT:    sarl $31, %edx
+; X86-NEXT:    shrdl $1, %edx, %edi
+; X86-NEXT:    movl %edi, %eax
+; X86-NEXT:    andl $1, %eax
+; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    andl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
+; X86-NEXT:    movl %edx, %eax
 ; X86-NEXT:    andl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
-; X86-NEXT:    movl %esi, %edx
 ; X86-NEXT:    andl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
-; X86-NEXT:    andl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
-; X86-NEXT:    subl %eax, %edi
-; X86-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    sbbl %esi, %ebx
-; X86-NEXT:    sbbl %edx, %ecx
+; X86-NEXT:    subl %edi, %ebx
+; X86-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    sbbl %edx, %esi
+; X86-NEXT:    movl %esi, %ebx
+; X86-NEXT:    sbbl %eax, %ecx
 ; X86-NEXT:    andl $3, %ecx
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
 ; X86-NEXT:    addl $-1, %eax
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
 ; X86-NEXT:    adcl $-1, %edx
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X86-NEXT:    adcl $3, %esi
-; X86-NEXT:    andl $3, %esi
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X86-NEXT:    adcl $3, %edi
+; X86-NEXT:    andl $3, %edi
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    orl %esi, %eax
+; X86-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    orl %edi, %eax
 ; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    orl %edx, %eax
 ; X86-NEXT:    jne .LBB0_12

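The pr38539.ll changes show the same rewrite applied to an odd-sized type: the 66-bit operands explain the movb $65, %cl bound and the andl $3 masks on the top word, while the shift amount's byte offset is now masked with andb $12 instead of the old andb $7 bit / andb $15 byte split. A hypothetical reduced form (the real @f is a larger reduction; the i66 division below is only meant to trigger the same expansion):

    ; Odd-width integer division forces the generic expansion with
    ; 65-bit shift amounts and 2-bit masks on the topmost 32-bit word.
    define i66 @div_i66(i66 %a, i66 %b) nounwind {
      %q = sdiv i66 %a, %b
      ret i66 %q
    }
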
diff  --git a/llvm/test/CodeGen/X86/scheduler-backtracking.ll b/llvm/test/CodeGen/X86/scheduler-backtracking.ll
index df3c25a8c42ad4..6be79edbe51e10 100644
--- a/llvm/test/CodeGen/X86/scheduler-backtracking.ll
+++ b/llvm/test/CodeGen/X86/scheduler-backtracking.ll
@@ -13,26 +13,24 @@ define i256 @test1(i256 %a) nounwind {
 ; ILP-LABEL: test1:
 ; ILP:       # %bb.0:
 ; ILP-NEXT:    movq %rdi, %rax
+; ILP-NEXT:    xorps %xmm0, %xmm0
+; ILP-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
+; ILP-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
+; ILP-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
 ; ILP-NEXT:    leal (%rsi,%rsi), %ecx
-; ILP-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
-; ILP-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
+; ILP-NEXT:    addb $3, %cl
 ; ILP-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
 ; ILP-NEXT:    movq $1, -{{[0-9]+}}(%rsp)
-; ILP-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
-; ILP-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
-; ILP-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
-; ILP-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
-; ILP-NEXT:    addb $3, %cl
 ; ILP-NEXT:    movl %ecx, %edx
 ; ILP-NEXT:    shrb $3, %dl
-; ILP-NEXT:    andb $7, %cl
+; ILP-NEXT:    andb $24, %dl
 ; ILP-NEXT:    negb %dl
 ; ILP-NEXT:    movsbq %dl, %rdx
-; ILP-NEXT:    movq -16(%rsp,%rdx), %rsi
-; ILP-NEXT:    movq -8(%rsp,%rdx), %rdi
+; ILP-NEXT:    movq -24(%rsp,%rdx), %rsi
+; ILP-NEXT:    movq -16(%rsp,%rdx), %rdi
 ; ILP-NEXT:    shldq %cl, %rsi, %rdi
-; ILP-NEXT:    movq -32(%rsp,%rdx), %r8
-; ILP-NEXT:    movq -24(%rsp,%rdx), %rdx
+; ILP-NEXT:    movq -40(%rsp,%rdx), %r8
+; ILP-NEXT:    movq -32(%rsp,%rdx), %rdx
 ; ILP-NEXT:    movq %r8, %r9
 ; ILP-NEXT:    shlq %cl, %r9
 ; ILP-NEXT:    movq %rdx, %r10
@@ -52,27 +50,25 @@ define i256 @test1(i256 %a) nounwind {
 ; HYBRID-LABEL: test1:
 ; HYBRID:       # %bb.0:
 ; HYBRID-NEXT:    movq %rdi, %rax
-; HYBRID-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
-; HYBRID-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
+; HYBRID-NEXT:    xorps %xmm0, %xmm0
+; HYBRID-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
+; HYBRID-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
+; HYBRID-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
 ; HYBRID-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
 ; HYBRID-NEXT:    movq $1, -{{[0-9]+}}(%rsp)
-; HYBRID-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
-; HYBRID-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
-; HYBRID-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
-; HYBRID-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
-; HYBRID-NEXT:    addl %esi, %esi
-; HYBRID-NEXT:    addb $3, %sil
-; HYBRID-NEXT:    movl %esi, %ecx
-; HYBRID-NEXT:    andb $7, %cl
-; HYBRID-NEXT:    shrb $3, %sil
-; HYBRID-NEXT:    negb %sil
-; HYBRID-NEXT:    movsbq %sil, %rdx
-; HYBRID-NEXT:    movq -16(%rsp,%rdx), %rsi
-; HYBRID-NEXT:    movq -8(%rsp,%rdx), %rdi
+; HYBRID-NEXT:    leal (%rsi,%rsi), %ecx
+; HYBRID-NEXT:    addb $3, %cl
+; HYBRID-NEXT:    movl %ecx, %edx
+; HYBRID-NEXT:    shrb $3, %dl
+; HYBRID-NEXT:    andb $24, %dl
+; HYBRID-NEXT:    negb %dl
+; HYBRID-NEXT:    movsbq %dl, %rdx
+; HYBRID-NEXT:    movq -24(%rsp,%rdx), %rsi
+; HYBRID-NEXT:    movq -16(%rsp,%rdx), %rdi
 ; HYBRID-NEXT:    shldq %cl, %rsi, %rdi
 ; HYBRID-NEXT:    movq %rdi, 24(%rax)
-; HYBRID-NEXT:    movq -32(%rsp,%rdx), %rdi
-; HYBRID-NEXT:    movq -24(%rsp,%rdx), %rdx
+; HYBRID-NEXT:    movq -40(%rsp,%rdx), %rdi
+; HYBRID-NEXT:    movq -32(%rsp,%rdx), %rdx
 ; HYBRID-NEXT:    movq %rdx, %r8
 ; HYBRID-NEXT:    shldq %cl, %rdi, %r8
 ; HYBRID-NEXT:    movq %r8, 8(%rax)
@@ -81,6 +77,7 @@ define i256 @test1(i256 %a) nounwind {
 ; HYBRID-NEXT:    shlq %cl, %rsi
 ; HYBRID-NEXT:    notb %cl
 ; HYBRID-NEXT:    shrq %rdx
+; HYBRID-NEXT:    # kill: def $cl killed $cl killed $ecx
 ; HYBRID-NEXT:    shrq %cl, %rdx
 ; HYBRID-NEXT:    orq %rsi, %rdx
 ; HYBRID-NEXT:    movq %rdx, 16(%rax)
@@ -89,27 +86,25 @@ define i256 @test1(i256 %a) nounwind {
 ; BURR-LABEL: test1:
 ; BURR:       # %bb.0:
 ; BURR-NEXT:    movq %rdi, %rax
-; BURR-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
-; BURR-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
+; BURR-NEXT:    xorps %xmm0, %xmm0
+; BURR-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
+; BURR-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
+; BURR-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
 ; BURR-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
 ; BURR-NEXT:    movq $1, -{{[0-9]+}}(%rsp)
-; BURR-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
-; BURR-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
-; BURR-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
-; BURR-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
-; BURR-NEXT:    addl %esi, %esi
-; BURR-NEXT:    addb $3, %sil
-; BURR-NEXT:    movl %esi, %ecx
-; BURR-NEXT:    andb $7, %cl
-; BURR-NEXT:    shrb $3, %sil
-; BURR-NEXT:    negb %sil
-; BURR-NEXT:    movsbq %sil, %rdx
-; BURR-NEXT:    movq -16(%rsp,%rdx), %rsi
-; BURR-NEXT:    movq -8(%rsp,%rdx), %rdi
+; BURR-NEXT:    leal (%rsi,%rsi), %ecx
+; BURR-NEXT:    addb $3, %cl
+; BURR-NEXT:    movl %ecx, %edx
+; BURR-NEXT:    shrb $3, %dl
+; BURR-NEXT:    andb $24, %dl
+; BURR-NEXT:    negb %dl
+; BURR-NEXT:    movsbq %dl, %rdx
+; BURR-NEXT:    movq -24(%rsp,%rdx), %rsi
+; BURR-NEXT:    movq -16(%rsp,%rdx), %rdi
 ; BURR-NEXT:    shldq %cl, %rsi, %rdi
 ; BURR-NEXT:    movq %rdi, 24(%rax)
-; BURR-NEXT:    movq -32(%rsp,%rdx), %rdi
-; BURR-NEXT:    movq -24(%rsp,%rdx), %rdx
+; BURR-NEXT:    movq -40(%rsp,%rdx), %rdi
+; BURR-NEXT:    movq -32(%rsp,%rdx), %rdx
 ; BURR-NEXT:    movq %rdx, %r8
 ; BURR-NEXT:    shldq %cl, %rdi, %r8
 ; BURR-NEXT:    movq %r8, 8(%rax)
@@ -118,6 +113,7 @@ define i256 @test1(i256 %a) nounwind {
 ; BURR-NEXT:    shlq %cl, %rsi
 ; BURR-NEXT:    notb %cl
 ; BURR-NEXT:    shrq %rdx
+; BURR-NEXT:    # kill: def $cl killed $cl killed $ecx
 ; BURR-NEXT:    shrq %cl, %rdx
 ; BURR-NEXT:    orq %rsi, %rdx
 ; BURR-NEXT:    movq %rdx, 16(%rax)
@@ -126,33 +122,31 @@ define i256 @test1(i256 %a) nounwind {
 ; SRC-LABEL: test1:
 ; SRC:       # %bb.0:
 ; SRC-NEXT:    movq %rdi, %rax
-; SRC-NEXT:    addl %esi, %esi
-; SRC-NEXT:    addb $3, %sil
-; SRC-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
-; SRC-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
+; SRC-NEXT:    leal (%rsi,%rsi), %edx
+; SRC-NEXT:    addb $3, %dl
+; SRC-NEXT:    xorps %xmm0, %xmm0
+; SRC-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
+; SRC-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
+; SRC-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
 ; SRC-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
 ; SRC-NEXT:    movq $1, -{{[0-9]+}}(%rsp)
-; SRC-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
-; SRC-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
-; SRC-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
-; SRC-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
-; SRC-NEXT:    movl %esi, %edx
-; SRC-NEXT:    andb $7, %dl
-; SRC-NEXT:    shrb $3, %sil
-; SRC-NEXT:    negb %sil
-; SRC-NEXT:    movsbq %sil, %rsi
-; SRC-NEXT:    movq -16(%rsp,%rsi), %rdi
+; SRC-NEXT:    movl %edx, %ecx
+; SRC-NEXT:    shrb $3, %cl
+; SRC-NEXT:    andb $24, %cl
+; SRC-NEXT:    negb %cl
+; SRC-NEXT:    movsbq %cl, %rsi
+; SRC-NEXT:    movq -24(%rsp,%rsi), %rdi
 ; SRC-NEXT:    movq %rdi, %r8
 ; SRC-NEXT:    movl %edx, %ecx
 ; SRC-NEXT:    shlq %cl, %r8
 ; SRC-NEXT:    notb %cl
-; SRC-NEXT:    movq -32(%rsp,%rsi), %r9
-; SRC-NEXT:    movq -24(%rsp,%rsi), %r10
+; SRC-NEXT:    movq -40(%rsp,%rsi), %r9
+; SRC-NEXT:    movq -32(%rsp,%rsi), %r10
 ; SRC-NEXT:    movq %r10, %r11
 ; SRC-NEXT:    shrq %r11
 ; SRC-NEXT:    shrq %cl, %r11
 ; SRC-NEXT:    orq %r8, %r11
-; SRC-NEXT:    movq -8(%rsp,%rsi), %rsi
+; SRC-NEXT:    movq -16(%rsp,%rsi), %rsi
 ; SRC-NEXT:    movl %edx, %ecx
 ; SRC-NEXT:    shldq %cl, %rdi, %rsi
 ; SRC-NEXT:    movq %r9, %rdi
@@ -171,27 +165,25 @@ define i256 @test1(i256 %a) nounwind {
 ; LIN-NEXT:    addb $3, %dl
 ; LIN-NEXT:    movl %edx, %ecx
 ; LIN-NEXT:    shrb $3, %cl
+; LIN-NEXT:    andb $24, %cl
 ; LIN-NEXT:    negb %cl
 ; LIN-NEXT:    movsbq %cl, %rsi
-; LIN-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
-; LIN-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
-; LIN-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
-; LIN-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
+; LIN-NEXT:    xorps %xmm0, %xmm0
+; LIN-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
+; LIN-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
 ; LIN-NEXT:    movq $1, -{{[0-9]+}}(%rsp)
 ; LIN-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
-; LIN-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
-; LIN-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
-; LIN-NEXT:    movq -32(%rsp,%rsi), %rdi
-; LIN-NEXT:    andb $7, %dl
+; LIN-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
+; LIN-NEXT:    movq -40(%rsp,%rsi), %rdi
 ; LIN-NEXT:    movq %rdi, %r8
 ; LIN-NEXT:    movl %edx, %ecx
 ; LIN-NEXT:    shlq %cl, %r8
 ; LIN-NEXT:    movq %r8, (%rax)
-; LIN-NEXT:    movq -24(%rsp,%rsi), %r8
+; LIN-NEXT:    movq -32(%rsp,%rsi), %r8
 ; LIN-NEXT:    movq %r8, %r9
 ; LIN-NEXT:    shldq %cl, %rdi, %r9
 ; LIN-NEXT:    movq %r9, 8(%rax)
-; LIN-NEXT:    movq -16(%rsp,%rsi), %rdi
+; LIN-NEXT:    movq -24(%rsp,%rsi), %rdi
 ; LIN-NEXT:    movq %rdi, %r9
 ; LIN-NEXT:    shlq %cl, %r9
 ; LIN-NEXT:    shrq %r8
@@ -199,7 +191,7 @@ define i256 @test1(i256 %a) nounwind {
 ; LIN-NEXT:    shrq %cl, %r8
 ; LIN-NEXT:    orq %r9, %r8
 ; LIN-NEXT:    movq %r8, 16(%rax)
-; LIN-NEXT:    movq -8(%rsp,%rsi), %rsi
+; LIN-NEXT:    movq -16(%rsp,%rsi), %rsi
 ; LIN-NEXT:    movl %edx, %ecx
 ; LIN-NEXT:    shldq %cl, %rdi, %rsi
 ; LIN-NEXT:    movq %rsi, 24(%rax)

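In the scheduler-backtracking variants the interesting diffs are the stack-slot initialization collapsing into 16-byte movaps stores (possible now that the slot is aligned) and the byte offset being masked with andb $24 to an 8-byte multiple, since x86-64 shifts in 64-bit units. A hypothetical reduced form of @test1, reconstructed from the visible arithmetic (leal (%rsi,%rsi), %ecx followed by addb $3, %cl):

    ; Shift the constant 1 (the movq $1 store) left by 2*%a + 3;
    ; only the low bits of the amount matter for an i256 shift.
    define i256 @test1_sketch(i256 %a) nounwind {
      %dbl = shl i256 %a, 1
      %amt = add i256 %dbl, 3
      %res = shl i256 1, %amt
      ret i256 %res
    }
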
diff  --git a/llvm/test/CodeGen/X86/shift-i128.ll b/llvm/test/CodeGen/X86/shift-i128.ll
index 4fbe05cd1b2f2f..767bd772ab7a3e 100644
--- a/llvm/test/CodeGen/X86/shift-i128.ll
+++ b/llvm/test/CodeGen/X86/shift-i128.ll
@@ -10,49 +10,45 @@ define void @test_lshr_i128(i128 %x, i128 %a, ptr nocapture %r) nounwind {
 ; i686-LABEL: test_lshr_i128:
 ; i686:       # %bb.0: # %entry
 ; i686-NEXT:    pushl %ebp
+; i686-NEXT:    movl %esp, %ebp
 ; i686-NEXT:    pushl %ebx
 ; i686-NEXT:    pushl %edi
 ; i686-NEXT:    pushl %esi
-; i686-NEXT:    subl $32, %esp
-; i686-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; i686-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; i686-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; i686-NEXT:    movl {{[0-9]+}}(%esp), %edi
-; i686-NEXT:    movl {{[0-9]+}}(%esp), %ebx
-; i686-NEXT:    movl %ebx, {{[0-9]+}}(%esp)
+; i686-NEXT:    andl $-16, %esp
+; i686-NEXT:    subl $48, %esp
+; i686-NEXT:    movl 24(%ebp), %ecx
+; i686-NEXT:    movl 8(%ebp), %eax
+; i686-NEXT:    movl 12(%ebp), %edx
+; i686-NEXT:    movl 16(%ebp), %esi
+; i686-NEXT:    movl 20(%ebp), %edi
 ; i686-NEXT:    movl %edi, {{[0-9]+}}(%esp)
 ; i686-NEXT:    movl %esi, {{[0-9]+}}(%esp)
+; i686-NEXT:    movl %edx, {{[0-9]+}}(%esp)
 ; i686-NEXT:    movl %eax, (%esp)
 ; i686-NEXT:    movl $0, {{[0-9]+}}(%esp)
 ; i686-NEXT:    movl $0, {{[0-9]+}}(%esp)
 ; i686-NEXT:    movl $0, {{[0-9]+}}(%esp)
 ; i686-NEXT:    movl $0, {{[0-9]+}}(%esp)
 ; i686-NEXT:    movl %ecx, %eax
-; i686-NEXT:    andb $7, %al
-; i686-NEXT:    shrb $3, %cl
-; i686-NEXT:    andb $15, %cl
-; i686-NEXT:    movzbl %cl, %ebp
-; i686-NEXT:    movl 4(%esp,%ebp), %edx
-; i686-NEXT:    movl %edx, %esi
-; i686-NEXT:    movl %eax, %ecx
-; i686-NEXT:    shrl %cl, %esi
-; i686-NEXT:    notb %cl
-; i686-NEXT:    movl 8(%esp,%ebp), %ebx
-; i686-NEXT:    leal (%ebx,%ebx), %edi
-; i686-NEXT:    shll %cl, %edi
-; i686-NEXT:    orl %esi, %edi
-; i686-NEXT:    movl (%esp,%ebp), %esi
-; i686-NEXT:    movl 12(%esp,%ebp), %ebp
-; i686-NEXT:    movl %eax, %ecx
-; i686-NEXT:    shrdl %cl, %ebp, %ebx
-; i686-NEXT:    shrdl %cl, %edx, %esi
-; i686-NEXT:    shrl %cl, %ebp
-; i686-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; i686-NEXT:    movl %ebp, 12(%eax)
-; i686-NEXT:    movl %ebx, 8(%eax)
-; i686-NEXT:    movl %esi, (%eax)
-; i686-NEXT:    movl %edi, 4(%eax)
-; i686-NEXT:    addl $32, %esp
+; i686-NEXT:    shrb $3, %al
+; i686-NEXT:    andb $12, %al
+; i686-NEXT:    movzbl %al, %edi
+; i686-NEXT:    movl 8(%esp,%edi), %eax
+; i686-NEXT:    movl 4(%esp,%edi), %ebx
+; i686-NEXT:    movl %ebx, %edx
+; i686-NEXT:    shrdl %cl, %eax, %edx
+; i686-NEXT:    movl (%esp,%edi), %esi
+; i686-NEXT:    movl 12(%esp,%edi), %edi
+; i686-NEXT:    shrdl %cl, %edi, %eax
+; i686-NEXT:    shrdl %cl, %ebx, %esi
+; i686-NEXT:    movl 40(%ebp), %ebx
+; i686-NEXT:    # kill: def $cl killed $cl killed $ecx
+; i686-NEXT:    shrl %cl, %edi
+; i686-NEXT:    movl %edi, 12(%ebx)
+; i686-NEXT:    movl %eax, 8(%ebx)
+; i686-NEXT:    movl %edx, 4(%ebx)
+; i686-NEXT:    movl %esi, (%ebx)
+; i686-NEXT:    leal -12(%ebp), %esp
 ; i686-NEXT:    popl %esi
 ; i686-NEXT:    popl %edi
 ; i686-NEXT:    popl %ebx
@@ -81,50 +77,46 @@ define void @test_ashr_i128(i128 %x, i128 %a, ptr nocapture %r) nounwind {
 ; i686-LABEL: test_ashr_i128:
 ; i686:       # %bb.0: # %entry
 ; i686-NEXT:    pushl %ebp
+; i686-NEXT:    movl %esp, %ebp
 ; i686-NEXT:    pushl %ebx
 ; i686-NEXT:    pushl %edi
 ; i686-NEXT:    pushl %esi
-; i686-NEXT:    subl $32, %esp
-; i686-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; i686-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; i686-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; i686-NEXT:    movl {{[0-9]+}}(%esp), %edi
-; i686-NEXT:    movl {{[0-9]+}}(%esp), %ebx
-; i686-NEXT:    movl %ebx, {{[0-9]+}}(%esp)
+; i686-NEXT:    andl $-16, %esp
+; i686-NEXT:    subl $48, %esp
+; i686-NEXT:    movl 24(%ebp), %ecx
+; i686-NEXT:    movl 8(%ebp), %eax
+; i686-NEXT:    movl 12(%ebp), %edx
+; i686-NEXT:    movl 16(%ebp), %esi
+; i686-NEXT:    movl 20(%ebp), %edi
 ; i686-NEXT:    movl %edi, {{[0-9]+}}(%esp)
 ; i686-NEXT:    movl %esi, {{[0-9]+}}(%esp)
+; i686-NEXT:    movl %edx, {{[0-9]+}}(%esp)
 ; i686-NEXT:    movl %eax, (%esp)
-; i686-NEXT:    sarl $31, %ebx
-; i686-NEXT:    movl %ebx, {{[0-9]+}}(%esp)
-; i686-NEXT:    movl %ebx, {{[0-9]+}}(%esp)
-; i686-NEXT:    movl %ebx, {{[0-9]+}}(%esp)
-; i686-NEXT:    movl %ebx, {{[0-9]+}}(%esp)
+; i686-NEXT:    sarl $31, %edi
+; i686-NEXT:    movl %edi, {{[0-9]+}}(%esp)
+; i686-NEXT:    movl %edi, {{[0-9]+}}(%esp)
+; i686-NEXT:    movl %edi, {{[0-9]+}}(%esp)
+; i686-NEXT:    movl %edi, {{[0-9]+}}(%esp)
 ; i686-NEXT:    movl %ecx, %eax
-; i686-NEXT:    andb $7, %al
-; i686-NEXT:    shrb $3, %cl
-; i686-NEXT:    andb $15, %cl
-; i686-NEXT:    movzbl %cl, %ebp
-; i686-NEXT:    movl 4(%esp,%ebp), %edx
-; i686-NEXT:    movl %edx, %esi
-; i686-NEXT:    movl %eax, %ecx
-; i686-NEXT:    shrl %cl, %esi
-; i686-NEXT:    notb %cl
-; i686-NEXT:    movl 8(%esp,%ebp), %ebx
-; i686-NEXT:    leal (%ebx,%ebx), %edi
-; i686-NEXT:    shll %cl, %edi
-; i686-NEXT:    orl %esi, %edi
-; i686-NEXT:    movl (%esp,%ebp), %esi
-; i686-NEXT:    movl 12(%esp,%ebp), %ebp
-; i686-NEXT:    movl %eax, %ecx
-; i686-NEXT:    shrdl %cl, %ebp, %ebx
-; i686-NEXT:    shrdl %cl, %edx, %esi
-; i686-NEXT:    sarl %cl, %ebp
-; i686-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; i686-NEXT:    movl %ebp, 12(%eax)
-; i686-NEXT:    movl %ebx, 8(%eax)
-; i686-NEXT:    movl %esi, (%eax)
-; i686-NEXT:    movl %edi, 4(%eax)
-; i686-NEXT:    addl $32, %esp
+; i686-NEXT:    shrb $3, %al
+; i686-NEXT:    andb $12, %al
+; i686-NEXT:    movzbl %al, %edi
+; i686-NEXT:    movl 8(%esp,%edi), %eax
+; i686-NEXT:    movl 4(%esp,%edi), %ebx
+; i686-NEXT:    movl %ebx, %edx
+; i686-NEXT:    shrdl %cl, %eax, %edx
+; i686-NEXT:    movl (%esp,%edi), %esi
+; i686-NEXT:    movl 12(%esp,%edi), %edi
+; i686-NEXT:    shrdl %cl, %edi, %eax
+; i686-NEXT:    shrdl %cl, %ebx, %esi
+; i686-NEXT:    movl 40(%ebp), %ebx
+; i686-NEXT:    # kill: def $cl killed $cl killed $ecx
+; i686-NEXT:    sarl %cl, %edi
+; i686-NEXT:    movl %edi, 12(%ebx)
+; i686-NEXT:    movl %eax, 8(%ebx)
+; i686-NEXT:    movl %edx, 4(%ebx)
+; i686-NEXT:    movl %esi, (%ebx)
+; i686-NEXT:    leal -12(%ebp), %esp
 ; i686-NEXT:    popl %esi
 ; i686-NEXT:    popl %edi
 ; i686-NEXT:    popl %ebx
@@ -154,15 +146,17 @@ define void @test_shl_i128(i128 %x, i128 %a, ptr nocapture %r) nounwind {
 ; i686-LABEL: test_shl_i128:
 ; i686:       # %bb.0: # %entry
 ; i686-NEXT:    pushl %ebp
+; i686-NEXT:    movl %esp, %ebp
 ; i686-NEXT:    pushl %ebx
 ; i686-NEXT:    pushl %edi
 ; i686-NEXT:    pushl %esi
-; i686-NEXT:    subl $32, %esp
-; i686-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; i686-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; i686-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; i686-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; i686-NEXT:    movl {{[0-9]+}}(%esp), %edi
+; i686-NEXT:    andl $-16, %esp
+; i686-NEXT:    subl $48, %esp
+; i686-NEXT:    movl 24(%ebp), %ecx
+; i686-NEXT:    movl 8(%ebp), %eax
+; i686-NEXT:    movl 12(%ebp), %edx
+; i686-NEXT:    movl 16(%ebp), %esi
+; i686-NEXT:    movl 20(%ebp), %edi
 ; i686-NEXT:    movl %edi, {{[0-9]+}}(%esp)
 ; i686-NEXT:    movl %esi, {{[0-9]+}}(%esp)
 ; i686-NEXT:    movl %edx, {{[0-9]+}}(%esp)
@@ -172,36 +166,27 @@ define void @test_shl_i128(i128 %x, i128 %a, ptr nocapture %r) nounwind {
 ; i686-NEXT:    movl $0, {{[0-9]+}}(%esp)
 ; i686-NEXT:    movl $0, (%esp)
 ; i686-NEXT:    movl %ecx, %eax
-; i686-NEXT:    andb $7, %al
-; i686-NEXT:    shrb $3, %cl
-; i686-NEXT:    andb $15, %cl
-; i686-NEXT:    negb %cl
-; i686-NEXT:    movsbl %cl, %ebp
-; i686-NEXT:    movl 24(%esp,%ebp), %ebx
-; i686-NEXT:    movl %ebx, %edx
-; i686-NEXT:    movl %eax, %ecx
-; i686-NEXT:    shll %cl, %edx
-; i686-NEXT:    notb %cl
-; i686-NEXT:    movl 20(%esp,%ebp), %edi
-; i686-NEXT:    movl %edi, %esi
-; i686-NEXT:    shrl %esi
-; i686-NEXT:    shrl %cl, %esi
-; i686-NEXT:    orl %edx, %esi
-; i686-NEXT:    movl 16(%esp,%ebp), %edx
-; i686-NEXT:    movl 28(%esp,%ebp), %ebp
-; i686-NEXT:    movl %eax, %ecx
-; i686-NEXT:    shldl %cl, %ebx, %ebp
-; i686-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; i686-NEXT:    movl %ebp, 12(%ecx)
-; i686-NEXT:    movl %edx, %ebx
-; i686-NEXT:    movl %eax, %ecx
-; i686-NEXT:    shll %cl, %ebx
-; i686-NEXT:    shldl %cl, %edx, %edi
-; i686-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; i686-NEXT:    movl %edi, 4(%eax)
-; i686-NEXT:    movl %ebx, (%eax)
-; i686-NEXT:    movl %esi, 8(%eax)
-; i686-NEXT:    addl $32, %esp
+; i686-NEXT:    shrb $3, %al
+; i686-NEXT:    andb $12, %al
+; i686-NEXT:    negb %al
+; i686-NEXT:    movsbl %al, %edi
+; i686-NEXT:    movl 20(%esp,%edi), %eax
+; i686-NEXT:    movl 24(%esp,%edi), %ebx
+; i686-NEXT:    movl %ebx, %esi
+; i686-NEXT:    shldl %cl, %eax, %esi
+; i686-NEXT:    movl 16(%esp,%edi), %edx
+; i686-NEXT:    movl 28(%esp,%edi), %edi
+; i686-NEXT:    shldl %cl, %ebx, %edi
+; i686-NEXT:    movl 40(%ebp), %ebx
+; i686-NEXT:    movl %edi, 12(%ebx)
+; i686-NEXT:    movl %esi, 8(%ebx)
+; i686-NEXT:    movl %edx, %esi
+; i686-NEXT:    shll %cl, %esi
+; i686-NEXT:    # kill: def $cl killed $cl killed $ecx
+; i686-NEXT:    shldl %cl, %edx, %eax
+; i686-NEXT:    movl %eax, 4(%ebx)
+; i686-NEXT:    movl %esi, (%ebx)
+; i686-NEXT:    leal -12(%ebp), %esp
 ; i686-NEXT:    popl %esi
 ; i686-NEXT:    popl %edi
 ; i686-NEXT:    popl %ebx
@@ -264,104 +249,93 @@ define void @test_lshr_v2i128(<2 x i128> %x, <2 x i128> %a, ptr nocapture %r) no
 ; i686-LABEL: test_lshr_v2i128:
 ; i686:       # %bb.0: # %entry
 ; i686-NEXT:    pushl %ebp
+; i686-NEXT:    movl %esp, %ebp
 ; i686-NEXT:    pushl %ebx
 ; i686-NEXT:    pushl %edi
 ; i686-NEXT:    pushl %esi
-; i686-NEXT:    subl $100, %esp
-; i686-NEXT:    movl {{[0-9]+}}(%esp), %ebx
-; i686-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; i686-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; i686-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; i686-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; i686-NEXT:    movl {{[0-9]+}}(%esp), %edi
-; i686-NEXT:    movl {{[0-9]+}}(%esp), %ebp
-; i686-NEXT:    movl %ebp, {{[0-9]+}}(%esp)
-; i686-NEXT:    movl {{[0-9]+}}(%esp), %ebp
-; i686-NEXT:    movl %ebp, {{[0-9]+}}(%esp)
-; i686-NEXT:    movl {{[0-9]+}}(%esp), %ebp
-; i686-NEXT:    movl %ebp, {{[0-9]+}}(%esp)
-; i686-NEXT:    movl {{[0-9]+}}(%esp), %ebp
-; i686-NEXT:    movl %ebp, {{[0-9]+}}(%esp)
+; i686-NEXT:    andl $-16, %esp
+; i686-NEXT:    subl $112, %esp
+; i686-NEXT:    movl 40(%ebp), %edx
+; i686-NEXT:    movl 24(%ebp), %eax
+; i686-NEXT:    movl 28(%ebp), %ecx
+; i686-NEXT:    movl 32(%ebp), %esi
+; i686-NEXT:    movl 20(%ebp), %edi
 ; i686-NEXT:    movl %edi, {{[0-9]+}}(%esp)
-; i686-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; i686-NEXT:    movl 16(%ebp), %edi
+; i686-NEXT:    movl %edi, {{[0-9]+}}(%esp)
+; i686-NEXT:    movl 12(%ebp), %edi
+; i686-NEXT:    movl %edi, {{[0-9]+}}(%esp)
+; i686-NEXT:    movl 8(%ebp), %edi
+; i686-NEXT:    movl %edi, {{[0-9]+}}(%esp)
+; i686-NEXT:    movl 36(%ebp), %edi
+; i686-NEXT:    movl %edi, {{[0-9]+}}(%esp)
+; i686-NEXT:    movl %esi, {{[0-9]+}}(%esp)
 ; i686-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
 ; i686-NEXT:    movl %eax, {{[0-9]+}}(%esp)
 ; i686-NEXT:    movl $0, {{[0-9]+}}(%esp)
 ; i686-NEXT:    movl $0, {{[0-9]+}}(%esp)
 ; i686-NEXT:    movl $0, {{[0-9]+}}(%esp)
 ; i686-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; i686-NEXT:    movl %esi, %ecx
-; i686-NEXT:    andl $7, %ecx
+; i686-NEXT:    movl %edx, %ebx
+; i686-NEXT:    andl $31, %ebx
+; i686-NEXT:    shrl $3, %edx
+; i686-NEXT:    andl $12, %edx
+; i686-NEXT:    movl 40(%esp,%edx), %eax
+; i686-NEXT:    movl 36(%esp,%edx), %esi
+; i686-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; i686-NEXT:    movl %ebx, %ecx
+; i686-NEXT:    shrdl %cl, %eax, %esi
+; i686-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; i686-NEXT:    movl 32(%esp,%edx), %ecx
 ; i686-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; i686-NEXT:    shrl $3, %esi
-; i686-NEXT:    andl $15, %esi
-; i686-NEXT:    movl 40(%esp,%esi), %eax
-; i686-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; i686-NEXT:    shrl %cl, %eax
-; i686-NEXT:    notl %ecx
-; i686-NEXT:    movl 44(%esp,%esi), %edx
-; i686-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; i686-NEXT:    addl %edx, %edx
-; i686-NEXT:    # kill: def $cl killed $cl killed $ecx
-; i686-NEXT:    shll %cl, %edx
-; i686-NEXT:    orl %eax, %edx
+; i686-NEXT:    movl 44(%esp,%edx), %edx
 ; i686-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; i686-NEXT:    movl 36(%esp,%esi), %eax
+; i686-NEXT:    movl %ebx, %ecx
+; i686-NEXT:    movl %ebx, %esi
+; i686-NEXT:    shrdl %cl, %edx, %eax
 ; i686-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; i686-NEXT:    movl $0, {{[0-9]+}}(%esp)
 ; i686-NEXT:    movl $0, {{[0-9]+}}(%esp)
 ; i686-NEXT:    movl $0, {{[0-9]+}}(%esp)
 ; i686-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; i686-NEXT:    movl %ebx, %edx
-; i686-NEXT:    andl $7, %edx
-; i686-NEXT:    shrl $3, %ebx
-; i686-NEXT:    andl $15, %ebx
-; i686-NEXT:    movl 72(%esp,%ebx), %ebp
-; i686-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; i686-NEXT:    movl %edx, %ecx
-; i686-NEXT:    shrl %cl, %ebp
-; i686-NEXT:    movl %edx, %ecx
-; i686-NEXT:    notl %ecx
-; i686-NEXT:    movl 76(%esp,%ebx), %eax
-; i686-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; i686-NEXT:    leal (%eax,%eax), %edi
-; i686-NEXT:    # kill: def $cl killed $cl killed $ecx
-; i686-NEXT:    shll %cl, %edi
-; i686-NEXT:    orl %ebp, %edi
-; i686-NEXT:    movl 48(%esp,%esi), %esi
-; i686-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; i686-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; i686-NEXT:    movl 56(%ebp), %edx
+; i686-NEXT:    movl %edx, %eax
+; i686-NEXT:    andl $31, %eax
+; i686-NEXT:    shrl $3, %edx
+; i686-NEXT:    andl $12, %edx
+; i686-NEXT:    movl 72(%esp,%edx), %ebx
+; i686-NEXT:    movl 68(%esp,%edx), %edi
+; i686-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; i686-NEXT:    movl %eax, %ecx
+; i686-NEXT:    shrdl %cl, %ebx, %edi
+; i686-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; i686-NEXT:    movl 64(%esp,%edx), %edi
+; i686-NEXT:    movl 76(%esp,%edx), %edx
+; i686-NEXT:    shrdl %cl, %edx, %ebx
+; i686-NEXT:    movl %esi, %ecx
+; i686-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
 ; i686-NEXT:    shrdl %cl, %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; i686-NEXT:    movl 68(%esp,%ebx), %ecx
-; i686-NEXT:    movl %ecx, (%esp) # 4-byte Spill
-; i686-NEXT:    movl 80(%esp,%ebx), %esi
-; i686-NEXT:    movl %edx, %ecx
-; i686-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; i686-NEXT:    shrdl %cl, %esi, %ebx
+; i686-NEXT:    # kill: def $cl killed $cl killed $ecx
+; i686-NEXT:    shrl %cl, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
 ; i686-NEXT:    movl %eax, %ecx
-; i686-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
-; i686-NEXT:    shrdl %cl, %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; i686-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
-; i686-NEXT:    shrl %cl, %ebp
-; i686-NEXT:    movl %edx, %ecx
-; i686-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; i686-NEXT:    shrdl %cl, %eax, (%esp) # 4-byte Folded Spill
-; i686-NEXT:    shrl %cl, %esi
-; i686-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; i686-NEXT:    movl %esi, 28(%ecx)
-; i686-NEXT:    movl %ebx, 24(%ecx)
-; i686-NEXT:    movl (%esp), %eax # 4-byte Reload
-; i686-NEXT:    movl %eax, 16(%ecx)
-; i686-NEXT:    movl %ebp, 12(%ecx)
-; i686-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; i686-NEXT:    movl %edx, 8(%ecx)
-; i686-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; i686-NEXT:    movl %edx, (%ecx)
-; i686-NEXT:    movl %edi, 20(%ecx)
-; i686-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; i686-NEXT:    movl %eax, 4(%ecx)
-; i686-NEXT:    addl $100, %esp
+; i686-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; i686-NEXT:    shrdl %cl, %esi, %edi
+; i686-NEXT:    shrl %cl, %edx
+; i686-NEXT:    movl 72(%ebp), %eax
+; i686-NEXT:    movl %edx, 28(%eax)
+; i686-NEXT:    movl %ebx, 24(%eax)
+; i686-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; i686-NEXT:    movl %ecx, 20(%eax)
+; i686-NEXT:    movl %edi, 16(%eax)
+; i686-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; i686-NEXT:    movl %ecx, 12(%eax)
+; i686-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; i686-NEXT:    movl %ecx, 8(%eax)
+; i686-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; i686-NEXT:    movl %ecx, 4(%eax)
+; i686-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; i686-NEXT:    movl %ecx, (%eax)
+; i686-NEXT:    leal -12(%ebp), %esp
 ; i686-NEXT:    popl %esi
 ; i686-NEXT:    popl %edi
 ; i686-NEXT:    popl %ebx
@@ -402,107 +376,96 @@ define void @test_ashr_v2i128(<2 x i128> %x, <2 x i128> %a, ptr nocapture %r) no
 ; i686-LABEL: test_ashr_v2i128:
 ; i686:       # %bb.0: # %entry
 ; i686-NEXT:    pushl %ebp
+; i686-NEXT:    movl %esp, %ebp
 ; i686-NEXT:    pushl %ebx
 ; i686-NEXT:    pushl %edi
 ; i686-NEXT:    pushl %esi
-; i686-NEXT:    subl $92, %esp
-; i686-NEXT:    movl {{[0-9]+}}(%esp), %ebp
-; i686-NEXT:    movl {{[0-9]+}}(%esp), %edi
-; i686-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; i686-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; i686-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; i686-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; i686-NEXT:    movl {{[0-9]+}}(%esp), %ebx
-; i686-NEXT:    movl %ebx, {{[0-9]+}}(%esp)
-; i686-NEXT:    movl {{[0-9]+}}(%esp), %ebx
-; i686-NEXT:    movl %ebx, {{[0-9]+}}(%esp)
-; i686-NEXT:    movl {{[0-9]+}}(%esp), %ebx
-; i686-NEXT:    movl %ebx, {{[0-9]+}}(%esp)
-; i686-NEXT:    movl {{[0-9]+}}(%esp), %ebx
-; i686-NEXT:    movl %ebx, {{[0-9]+}}(%esp)
-; i686-NEXT:    sarl $31, %ebx
-; i686-NEXT:    movl %ebx, {{[0-9]+}}(%esp)
-; i686-NEXT:    movl %ebx, {{[0-9]+}}(%esp)
-; i686-NEXT:    movl %ebx, {{[0-9]+}}(%esp)
-; i686-NEXT:    movl %ebx, {{[0-9]+}}(%esp)
-; i686-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; i686-NEXT:    andl $-16, %esp
+; i686-NEXT:    subl $112, %esp
+; i686-NEXT:    movl 40(%ebp), %edx
+; i686-NEXT:    movl 24(%ebp), %eax
+; i686-NEXT:    movl 28(%ebp), %ecx
+; i686-NEXT:    movl 32(%ebp), %esi
+; i686-NEXT:    movl 16(%ebp), %edi
+; i686-NEXT:    movl %edi, {{[0-9]+}}(%esp)
+; i686-NEXT:    movl 12(%ebp), %edi
+; i686-NEXT:    movl %edi, {{[0-9]+}}(%esp)
+; i686-NEXT:    movl 8(%ebp), %edi
+; i686-NEXT:    movl %edi, {{[0-9]+}}(%esp)
+; i686-NEXT:    movl 20(%ebp), %edi
+; i686-NEXT:    movl %edi, {{[0-9]+}}(%esp)
+; i686-NEXT:    sarl $31, %edi
+; i686-NEXT:    movl %edi, {{[0-9]+}}(%esp)
+; i686-NEXT:    movl %edi, {{[0-9]+}}(%esp)
+; i686-NEXT:    movl %edi, {{[0-9]+}}(%esp)
+; i686-NEXT:    movl %edi, {{[0-9]+}}(%esp)
+; i686-NEXT:    movl 36(%ebp), %edi
+; i686-NEXT:    movl %edi, {{[0-9]+}}(%esp)
 ; i686-NEXT:    movl %esi, {{[0-9]+}}(%esp)
-; i686-NEXT:    movl %edx, {{[0-9]+}}(%esp)
 ; i686-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
-; i686-NEXT:    sarl $31, %eax
-; i686-NEXT:    movl %eax, {{[0-9]+}}(%esp)
 ; i686-NEXT:    movl %eax, {{[0-9]+}}(%esp)
-; i686-NEXT:    movl %eax, {{[0-9]+}}(%esp)
-; i686-NEXT:    movl %eax, {{[0-9]+}}(%esp)
-; i686-NEXT:    movl %edi, %ebx
-; i686-NEXT:    andl $7, %ebx
-; i686-NEXT:    shrl $3, %edi
-; i686-NEXT:    andl $15, %edi
-; i686-NEXT:    movl 32(%esp,%edi), %eax
-; i686-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; i686-NEXT:    movl %ebx, %ecx
-; i686-NEXT:    shrl %cl, %eax
+; i686-NEXT:    sarl $31, %edi
+; i686-NEXT:    movl %edi, {{[0-9]+}}(%esp)
+; i686-NEXT:    movl %edi, {{[0-9]+}}(%esp)
+; i686-NEXT:    movl %edi, {{[0-9]+}}(%esp)
+; i686-NEXT:    movl %edi, {{[0-9]+}}(%esp)
+; i686-NEXT:    movl %edx, %eax
+; i686-NEXT:    andl $31, %eax
+; i686-NEXT:    shrl $3, %edx
+; i686-NEXT:    andl $12, %edx
+; i686-NEXT:    movl 40(%esp,%edx), %esi
+; i686-NEXT:    movl 36(%esp,%edx), %edi
+; i686-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; i686-NEXT:    movl %eax, %ecx
+; i686-NEXT:    shrdl %cl, %esi, %edi
+; i686-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; i686-NEXT:    movl 32(%esp,%edx), %ecx
+; i686-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; i686-NEXT:    movl 44(%esp,%edx), %edx
+; i686-NEXT:    movl %edx, (%esp) # 4-byte Spill
+; i686-NEXT:    movl %eax, %ecx
+; i686-NEXT:    shrdl %cl, %edx, %esi
+; i686-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; i686-NEXT:    movl 56(%ebp), %edx
+; i686-NEXT:    movl %edx, %ebx
+; i686-NEXT:    andl $31, %ebx
+; i686-NEXT:    shrl $3, %edx
+; i686-NEXT:    andl $12, %edx
+; i686-NEXT:    movl 72(%esp,%edx), %esi
+; i686-NEXT:    movl 68(%esp,%edx), %edi
+; i686-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; i686-NEXT:    movl %ebx, %ecx
-; i686-NEXT:    notl %ecx
-; i686-NEXT:    movl 36(%esp,%edi), %edx
-; i686-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; i686-NEXT:    addl %edx, %edx
-; i686-NEXT:    # kill: def $cl killed $cl killed $ecx
-; i686-NEXT:    shll %cl, %edx
-; i686-NEXT:    orl %eax, %edx
-; i686-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; i686-NEXT:    movl %ebp, %eax
-; i686-NEXT:    movl %ebp, %edx
-; i686-NEXT:    andl $7, %edx
-; i686-NEXT:    shrl $3, %eax
-; i686-NEXT:    andl $15, %eax
-; i686-NEXT:    movl 64(%esp,%eax), %ebp
-; i686-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; i686-NEXT:    movl %eax, (%esp) # 4-byte Spill
-; i686-NEXT:    movl %edx, %ecx
-; i686-NEXT:    shrl %cl, %ebp
-; i686-NEXT:    movl %edx, %ecx
-; i686-NEXT:    notl %ecx
-; i686-NEXT:    movl 68(%esp,%eax), %esi
-; i686-NEXT:    leal (%esi,%esi), %eax
-; i686-NEXT:    # kill: def $cl killed $cl killed $ecx
-; i686-NEXT:    shll %cl, %eax
-; i686-NEXT:    orl %ebp, %eax
-; i686-NEXT:    movl 28(%esp,%edi), %ecx
+; i686-NEXT:    shrdl %cl, %esi, %edi
+; i686-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; i686-NEXT:    movl 64(%esp,%edx), %ecx
 ; i686-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; i686-NEXT:    movl 40(%esp,%edi), %edi
+; i686-NEXT:    movl 76(%esp,%edx), %edx
 ; i686-NEXT:    movl %ebx, %ecx
+; i686-NEXT:    shrdl %cl, %edx, %esi
+; i686-NEXT:    movl %eax, %ecx
+; i686-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
 ; i686-NEXT:    shrdl %cl, %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; i686-NEXT:    movl (%esp), %ecx # 4-byte Reload
-; i686-NEXT:    movl 60(%esp,%ecx), %ebp
-; i686-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; i686-NEXT:    movl 72(%esp,%ecx), %ebp
-; i686-NEXT:    movl %edx, %ecx
-; i686-NEXT:    shrdl %cl, %ebp, %esi
-; i686-NEXT:    movl %esi, (%esp) # 4-byte Spill
+; i686-NEXT:    sarl %cl, (%esp) # 4-byte Folded Spill
 ; i686-NEXT:    movl %ebx, %ecx
-; i686-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; i686-NEXT:    shrdl %cl, %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; i686-NEXT:    sarl %cl, %edi
-; i686-NEXT:    movl %edx, %ecx
-; i686-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; i686-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; i686-NEXT:    shrdl %cl, %esi, %ebx
-; i686-NEXT:    sarl %cl, %ebp
-; i686-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; i686-NEXT:    movl %ebp, 28(%ecx)
-; i686-NEXT:    movl (%esp), %edx # 4-byte Reload
-; i686-NEXT:    movl %edx, 24(%ecx)
-; i686-NEXT:    movl %ebx, 16(%ecx)
-; i686-NEXT:    movl %edi, 12(%ecx)
-; i686-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; i686-NEXT:    movl %edx, 8(%ecx)
-; i686-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; i686-NEXT:    movl %edx, (%ecx)
-; i686-NEXT:    movl %eax, 20(%ecx)
+; i686-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
 ; i686-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; i686-NEXT:    movl %eax, 4(%ecx)
-; i686-NEXT:    addl $92, %esp
+; i686-NEXT:    shrdl %cl, %eax, %edi
+; i686-NEXT:    sarl %cl, %edx
+; i686-NEXT:    movl 72(%ebp), %eax
+; i686-NEXT:    movl %edx, 28(%eax)
+; i686-NEXT:    movl %esi, 24(%eax)
+; i686-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; i686-NEXT:    movl %ecx, 20(%eax)
+; i686-NEXT:    movl %edi, 16(%eax)
+; i686-NEXT:    movl (%esp), %ecx # 4-byte Reload
+; i686-NEXT:    movl %ecx, 12(%eax)
+; i686-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; i686-NEXT:    movl %ecx, 8(%eax)
+; i686-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; i686-NEXT:    movl %ecx, 4(%eax)
+; i686-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; i686-NEXT:    movl %ecx, (%eax)
+; i686-NEXT:    leal -12(%ebp), %esp
 ; i686-NEXT:    popl %esi
 ; i686-NEXT:    popl %edi
 ; i686-NEXT:    popl %ebx
@@ -546,112 +509,106 @@ define void @test_shl_v2i128(<2 x i128> %x, <2 x i128> %a, ptr nocapture %r) nou
 ; i686-LABEL: test_shl_v2i128:
 ; i686:       # %bb.0: # %entry
 ; i686-NEXT:    pushl %ebp
+; i686-NEXT:    movl %esp, %ebp
 ; i686-NEXT:    pushl %ebx
 ; i686-NEXT:    pushl %edi
 ; i686-NEXT:    pushl %esi
-; i686-NEXT:    subl $100, %esp
-; i686-NEXT:    movl {{[0-9]+}}(%esp), %ebp
-; i686-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; i686-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; i686-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; i686-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; i686-NEXT:    movl {{[0-9]+}}(%esp), %edi
-; i686-NEXT:    movl {{[0-9]+}}(%esp), %ebx
-; i686-NEXT:    movl %ebx, {{[0-9]+}}(%esp)
-; i686-NEXT:    movl {{[0-9]+}}(%esp), %ebx
-; i686-NEXT:    movl %ebx, {{[0-9]+}}(%esp)
-; i686-NEXT:    movl {{[0-9]+}}(%esp), %ebx
-; i686-NEXT:    movl %ebx, {{[0-9]+}}(%esp)
-; i686-NEXT:    movl %edi, {{[0-9]+}}(%esp)
+; i686-NEXT:    andl $-16, %esp
+; i686-NEXT:    subl $128, %esp
+; i686-NEXT:    movl 40(%ebp), %edi
+; i686-NEXT:    movl 24(%ebp), %eax
+; i686-NEXT:    movl 28(%ebp), %ecx
+; i686-NEXT:    movl 32(%ebp), %edx
+; i686-NEXT:    movl 20(%ebp), %esi
+; i686-NEXT:    movl %esi, {{[0-9]+}}(%esp)
+; i686-NEXT:    movl 16(%ebp), %esi
+; i686-NEXT:    movl %esi, {{[0-9]+}}(%esp)
+; i686-NEXT:    movl 12(%ebp), %esi
+; i686-NEXT:    movl %esi, {{[0-9]+}}(%esp)
+; i686-NEXT:    movl 8(%ebp), %esi
+; i686-NEXT:    movl %esi, {{[0-9]+}}(%esp)
+; i686-NEXT:    movl 36(%ebp), %esi
 ; i686-NEXT:    movl %esi, {{[0-9]+}}(%esp)
 ; i686-NEXT:    movl %edx, {{[0-9]+}}(%esp)
 ; i686-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
 ; i686-NEXT:    movl %eax, {{[0-9]+}}(%esp)
-; i686-NEXT:    movl %ebp, %ecx
-; i686-NEXT:    shrl $3, %ebp
-; i686-NEXT:    andl $15, %ebp
+; i686-NEXT:    movl %edi, %ebx
+; i686-NEXT:    shrl $3, %ebx
+; i686-NEXT:    andl $12, %ebx
 ; i686-NEXT:    leal {{[0-9]+}}(%esp), %eax
-; i686-NEXT:    subl %ebp, %eax
+; i686-NEXT:    subl %ebx, %eax
 ; i686-NEXT:    movl $0, {{[0-9]+}}(%esp)
 ; i686-NEXT:    movl $0, {{[0-9]+}}(%esp)
 ; i686-NEXT:    movl $0, {{[0-9]+}}(%esp)
 ; i686-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; i686-NEXT:    movl 8(%eax), %edx
-; i686-NEXT:    movl %edx, (%esp) # 4-byte Spill
-; i686-NEXT:    andl $7, %ecx
+; i686-NEXT:    movl (%eax), %esi
+; i686-NEXT:    movl 4(%eax), %edx
+; i686-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; i686-NEXT:    movl 8(%eax), %eax
+; i686-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; i686-NEXT:    movl %edi, %ecx
+; i686-NEXT:    andl $31, %ecx
 ; i686-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; i686-NEXT:    shll %cl, %edx
-; i686-NEXT:    movl 4(%eax), %esi
-; i686-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; i686-NEXT:    shrl %esi
-; i686-NEXT:    notl %ecx
 ; i686-NEXT:    # kill: def $cl killed $cl killed $ecx
-; i686-NEXT:    shrl %cl, %esi
-; i686-NEXT:    orl %edx, %esi
-; i686-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; i686-NEXT:    movl {{[0-9]+}}(%esp), %ebx
-; i686-NEXT:    movl (%eax), %eax
+; i686-NEXT:    shldl %cl, %edx, %eax
 ; i686-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; i686-NEXT:    movl %ebx, %edx
+; i686-NEXT:    movl 56(%ebp), %eax
+; i686-NEXT:    movl %eax, %edx
 ; i686-NEXT:    shrl $3, %edx
-; i686-NEXT:    andl $15, %edx
-; i686-NEXT:    leal {{[0-9]+}}(%esp), %esi
-; i686-NEXT:    subl %edx, %esi
+; i686-NEXT:    andl $12, %edx
+; i686-NEXT:    leal {{[0-9]+}}(%esp), %ecx
+; i686-NEXT:    subl %edx, %ecx
 ; i686-NEXT:    movl $0, {{[0-9]+}}(%esp)
 ; i686-NEXT:    movl $0, {{[0-9]+}}(%esp)
 ; i686-NEXT:    movl $0, {{[0-9]+}}(%esp)
 ; i686-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; i686-NEXT:    andl $7, %ebx
-; i686-NEXT:    movl 8(%esi), %edi
+; i686-NEXT:    movl (%ecx), %edi
 ; i686-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; i686-NEXT:    movl %ebx, %ecx
-; i686-NEXT:    shll %cl, %edi
-; i686-NEXT:    movl 4(%esi), %eax
+; i686-NEXT:    movl 4(%ecx), %edi
+; i686-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; i686-NEXT:    movl 8(%ecx), %ecx
+; i686-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; i686-NEXT:    andl $31, %eax
 ; i686-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; i686-NEXT:    shrl %eax
-; i686-NEXT:    movl %ebx, %ecx
-; i686-NEXT:    notl %ecx
+; i686-NEXT:    movl %ecx, %eax
+; i686-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
 ; i686-NEXT:    # kill: def $cl killed $cl killed $ecx
-; i686-NEXT:    shrl %cl, %eax
-; i686-NEXT:    orl %edi, %eax
-; i686-NEXT:    movl (%esi), %ecx
-; i686-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; i686-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; i686-NEXT:    movl %esi, %edi
+; i686-NEXT:    shldl %cl, %edi, %eax
+; i686-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; i686-NEXT:    movl %esi, %eax
 ; i686-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; i686-NEXT:    shll %cl, %edi
-; i686-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; i686-NEXT:    shll %cl, %eax
+; i686-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; i686-NEXT:    shldl %cl, %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; i686-NEXT:    negl %ebp
-; i686-NEXT:    movl 64(%esp,%ebp), %esi
-; i686-NEXT:    # kill: def $cl killed $cl killed $ecx
-; i686-NEXT:    movl (%esp), %edi # 4-byte Reload
-; i686-NEXT:    shldl %cl, %edi, %esi
-; i686-NEXT:    movl %esi, (%esp) # 4-byte Spill
+; i686-NEXT:    negl %ebx
+; i686-NEXT:    movl 76(%esp,%ebx), %ebx
 ; i686-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; i686-NEXT:    movl %esi, %edi
-; i686-NEXT:    movl %ebx, %ecx
-; i686-NEXT:    shll %cl, %edi
-; i686-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
-; i686-NEXT:    shldl %cl, %esi, %ebp
+; i686-NEXT:    shldl %cl, %esi, %ebx
+; i686-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; i686-NEXT:    movl %edi, %esi
+; i686-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; i686-NEXT:    movl %eax, %ecx
+; i686-NEXT:    shll %cl, %esi
+; i686-NEXT:    shldl %cl, %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
 ; i686-NEXT:    negl %edx
-; i686-NEXT:    movl 96(%esp,%edx), %edx
-; i686-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; i686-NEXT:    shldl %cl, %ebx, %edx
-; i686-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; i686-NEXT:    movl %edx, 28(%ecx)
-; i686-NEXT:    movl %ebp, 20(%ecx)
-; i686-NEXT:    movl %edi, 16(%ecx)
-; i686-NEXT:    movl (%esp), %edx # 4-byte Reload
-; i686-NEXT:    movl %edx, 12(%ecx)
-; i686-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; i686-NEXT:    movl %edx, 4(%ecx)
-; i686-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; i686-NEXT:    movl %edx, (%ecx)
-; i686-NEXT:    movl %eax, 24(%ecx)
+; i686-NEXT:    movl 108(%esp,%edx), %edx
 ; i686-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; i686-NEXT:    movl %eax, 8(%ecx)
-; i686-NEXT:    addl $100, %esp
+; i686-NEXT:    shldl %cl, %eax, %edx
+; i686-NEXT:    movl 72(%ebp), %eax
+; i686-NEXT:    movl %edx, 28(%eax)
+; i686-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; i686-NEXT:    movl %ecx, 24(%eax)
+; i686-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; i686-NEXT:    movl %ecx, 20(%eax)
+; i686-NEXT:    movl %esi, 16(%eax)
+; i686-NEXT:    movl %ebx, 12(%eax)
+; i686-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; i686-NEXT:    movl %ecx, 8(%eax)
+; i686-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; i686-NEXT:    movl %ecx, 4(%eax)
+; i686-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; i686-NEXT:    movl %ecx, (%eax)
+; i686-NEXT:    leal -12(%ebp), %esp
 ; i686-NEXT:    popl %esi
 ; i686-NEXT:    popl %edi
 ; i686-NEXT:    popl %ebx

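The i686 diffs above show the effect of the new lowering on this test: the byte offset into the stack slot is now rounded to a multiple of the 4-byte native word (`andl $12` where the old output used `andl $15`), so the loads from the slot are aligned, and the residual bit shift is taken modulo the 32-bit word (`andl $31` instead of `andl $7`). A minimal C sketch of that scheme, for a 128-bit logical right shift on a 32-bit target — the function and variable names here are illustrative only, not part of the patch:

#include <stdint.h>
#include <string.h>

// Logical right shift of a 128-bit value held as four 32-bit words.
static void lshr_128_via_stack(const uint32_t src[4], unsigned amt,
                               uint32_t dst[4]) {
  // Double-width aligned slot: the value in the low half, zero fill
  // above it, so word loads past the value supply the shifted-in zeros.
  _Alignas(16) uint32_t slot[8] = {0};
  memcpy(slot, src, 16);

  unsigned word = (amt / 8) & ~3u; // byte offset rounded to a 4-byte
                                   // unit (cf. the `andl $12` above)
  unsigned bits = amt & 31;        // residual in-word shift
                                   // (cf. the `andl $31` above)
  const uint32_t *p = slot + word / 4; // aligned word loads
  for (int i = 0; i < 4; ++i)
    dst[i] = bits ? (p[i] >> bits) |
                        (uint32_t)((uint64_t)p[i + 1] << (32 - bits))
                  : p[i]; // no follow-up shift when amt % 32 == 0

When the shift amount is a known multiple of the word width, `bits` is zero and the funnel step disappears entirely, which is the one-step case the new `_dwordOff` tests below exercise.
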
diff --git a/llvm/test/CodeGen/X86/shift-i256.ll b/llvm/test/CodeGen/X86/shift-i256.ll
index e1466aebf42258..128e2199fb56f6 100644
--- a/llvm/test/CodeGen/X86/shift-i256.ll
+++ b/llvm/test/CodeGen/X86/shift-i256.ll
@@ -8,98 +8,78 @@ define void @shift1(i256 %x, i256 %a, ptr nocapture %r) nounwind readnone {
 ; CHECK-LABEL: shift1:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    pushl %ebp
+; CHECK-NEXT:    movl %esp, %ebp
 ; CHECK-NEXT:    pushl %ebx
 ; CHECK-NEXT:    pushl %edi
 ; CHECK-NEXT:    pushl %esi
-; CHECK-NEXT:    subl $92, %esp
-; CHECK-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; CHECK-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; CHECK-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; CHECK-NEXT:    movl {{[0-9]+}}(%esp), %edi
-; CHECK-NEXT:    movl {{[0-9]+}}(%esp), %ebx
-; CHECK-NEXT:    movl {{[0-9]+}}(%esp), %ebp
-; CHECK-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; CHECK-NEXT:    movl %esi, {{[0-9]+}}(%esp)
-; CHECK-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; CHECK-NEXT:    movl %esi, {{[0-9]+}}(%esp)
-; CHECK-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; CHECK-NEXT:    movl %esi, {{[0-9]+}}(%esp)
-; CHECK-NEXT:    movl %ebp, {{[0-9]+}}(%esp)
-; CHECK-NEXT:    movl %ebx, {{[0-9]+}}(%esp)
+; CHECK-NEXT:    andl $-16, %esp
+; CHECK-NEXT:    subl $112, %esp
+; CHECK-NEXT:    movl 40(%ebp), %ecx
+; CHECK-NEXT:    movl 8(%ebp), %eax
+; CHECK-NEXT:    movl 12(%ebp), %edx
+; CHECK-NEXT:    movl 16(%ebp), %esi
+; CHECK-NEXT:    movl 32(%ebp), %edi
+; CHECK-NEXT:    movl %edi, {{[0-9]+}}(%esp)
+; CHECK-NEXT:    movl 28(%ebp), %edi
+; CHECK-NEXT:    movl %edi, {{[0-9]+}}(%esp)
+; CHECK-NEXT:    movl 24(%ebp), %edi
 ; CHECK-NEXT:    movl %edi, {{[0-9]+}}(%esp)
+; CHECK-NEXT:    movl 20(%ebp), %edi
+; CHECK-NEXT:    movl %edi, {{[0-9]+}}(%esp)
+; CHECK-NEXT:    movl 36(%ebp), %edi
+; CHECK-NEXT:    movl %edi, {{[0-9]+}}(%esp)
+; CHECK-NEXT:    movl %esi, {{[0-9]+}}(%esp)
 ; CHECK-NEXT:    movl %edx, {{[0-9]+}}(%esp)
 ; CHECK-NEXT:    movl %eax, {{[0-9]+}}(%esp)
-; CHECK-NEXT:    sarl $31, %esi
-; CHECK-NEXT:    movl %esi, {{[0-9]+}}(%esp)
-; CHECK-NEXT:    movl %esi, {{[0-9]+}}(%esp)
-; CHECK-NEXT:    movl %esi, {{[0-9]+}}(%esp)
-; CHECK-NEXT:    movl %esi, {{[0-9]+}}(%esp)
-; CHECK-NEXT:    movl %esi, {{[0-9]+}}(%esp)
-; CHECK-NEXT:    movl %esi, {{[0-9]+}}(%esp)
-; CHECK-NEXT:    movl %esi, {{[0-9]+}}(%esp)
-; CHECK-NEXT:    movl %esi, {{[0-9]+}}(%esp)
+; CHECK-NEXT:    sarl $31, %edi
+; CHECK-NEXT:    movl %edi, {{[0-9]+}}(%esp)
+; CHECK-NEXT:    movl %edi, {{[0-9]+}}(%esp)
+; CHECK-NEXT:    movl %edi, {{[0-9]+}}(%esp)
+; CHECK-NEXT:    movl %edi, {{[0-9]+}}(%esp)
+; CHECK-NEXT:    movl %edi, {{[0-9]+}}(%esp)
+; CHECK-NEXT:    movl %edi, {{[0-9]+}}(%esp)
+; CHECK-NEXT:    movl %edi, {{[0-9]+}}(%esp)
+; CHECK-NEXT:    movl %edi, {{[0-9]+}}(%esp)
 ; CHECK-NEXT:    movl %ecx, %eax
-; CHECK-NEXT:    andb $7, %al
-; CHECK-NEXT:    shrb $3, %cl
-; CHECK-NEXT:    movzbl %cl, %ebp
-; CHECK-NEXT:    movl 32(%esp,%ebp), %esi
+; CHECK-NEXT:    shrb $5, %al
+; CHECK-NEXT:    movzbl %al, %eax
+; CHECK-NEXT:    movl 40(%esp,%eax,4), %edx
+; CHECK-NEXT:    movl 36(%esp,%eax,4), %esi
 ; CHECK-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; CHECK-NEXT:    movl %eax, %ecx
-; CHECK-NEXT:    shrl %cl, %esi
-; CHECK-NEXT:    movl %eax, %edx
-; CHECK-NEXT:    notb %dl
-; CHECK-NEXT:    movl 36(%esp,%ebp), %ecx
-; CHECK-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; CHECK-NEXT:    leal (%ecx,%ecx), %edi
-; CHECK-NEXT:    movl %edx, %ecx
-; CHECK-NEXT:    shll %cl, %edi
-; CHECK-NEXT:    orl %esi, %edi
-; CHECK-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; CHECK-NEXT:    movl 40(%esp,%ebp), %esi
+; CHECK-NEXT:    shrdl %cl, %edx, %esi
 ; CHECK-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; CHECK-NEXT:    movl %eax, %ecx
-; CHECK-NEXT:    shrl %cl, %esi
-; CHECK-NEXT:    movl 44(%esp,%ebp), %ecx
-; CHECK-NEXT:    movl %ecx, (%esp) # 4-byte Spill
-; CHECK-NEXT:    leal (%ecx,%ecx), %edi
-; CHECK-NEXT:    movl %edx, %ecx
-; CHECK-NEXT:    shll %cl, %edi
-; CHECK-NEXT:    orl %esi, %edi
+; CHECK-NEXT:    movl 44(%esp,%eax,4), %esi
+; CHECK-NEXT:    shrdl %cl, %esi, %edx
+; CHECK-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; CHECK-NEXT:    movl 48(%esp,%eax,4), %ebx
+; CHECK-NEXT:    shrdl %cl, %ebx, %esi
+; CHECK-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; CHECK-NEXT:    movl 52(%esp,%eax,4), %esi
+; CHECK-NEXT:    shrdl %cl, %esi, %ebx
+; CHECK-NEXT:    movl 56(%esp,%eax,4), %edx
+; CHECK-NEXT:    shrdl %cl, %edx, %esi
+; CHECK-NEXT:    movl 32(%esp,%eax,4), %edi
 ; CHECK-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; CHECK-NEXT:    movl 48(%esp,%ebp), %ebx
-; CHECK-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; CHECK-NEXT:    movl %eax, %ecx
-; CHECK-NEXT:    shrl %cl, %ebx
-; CHECK-NEXT:    movl 52(%esp,%ebp), %edi
-; CHECK-NEXT:    leal (%edi,%edi), %esi
-; CHECK-NEXT:    movl %edx, %ecx
-; CHECK-NEXT:    shll %cl, %esi
-; CHECK-NEXT:    orl %ebx, %esi
-; CHECK-NEXT:    movl %eax, %ecx
-; CHECK-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; CHECK-NEXT:    shrdl %cl, %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; CHECK-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; CHECK-NEXT:    shrdl %cl, %edx, (%esp) # 4-byte Folded Spill
-; CHECK-NEXT:    movl 28(%esp,%ebp), %edx
-; CHECK-NEXT:    movl 56(%esp,%ebp), %ebx
-; CHECK-NEXT:    shrdl %cl, %ebx, %edi
-; CHECK-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
-; CHECK-NEXT:    shrdl %cl, %ebp, %edx
-; CHECK-NEXT:    sarl %cl, %ebx
-; CHECK-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; CHECK-NEXT:    movl %ebx, 28(%eax)
-; CHECK-NEXT:    movl %edi, 24(%eax)
-; CHECK-NEXT:    movl (%esp), %ecx # 4-byte Reload
-; CHECK-NEXT:    movl %ecx, 16(%eax)
-; CHECK-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; CHECK-NEXT:    movl %ecx, 8(%eax)
-; CHECK-NEXT:    movl %edx, (%eax)
-; CHECK-NEXT:    movl %esi, 20(%eax)
-; CHECK-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; CHECK-NEXT:    movl %ecx, 12(%eax)
-; CHECK-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; CHECK-NEXT:    movl %ecx, 4(%eax)
-; CHECK-NEXT:    addl $92, %esp
+; CHECK-NEXT:    movl 60(%esp,%eax,4), %eax
+; CHECK-NEXT:    shrdl %cl, %eax, %edx
+; CHECK-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; CHECK-NEXT:    shrdl %cl, %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; CHECK-NEXT:    # kill: def $cl killed $cl killed $ecx
+; CHECK-NEXT:    sarl %cl, %eax
+; CHECK-NEXT:    movl 72(%ebp), %ecx
+; CHECK-NEXT:    movl %eax, 28(%ecx)
+; CHECK-NEXT:    movl %edx, 24(%ecx)
+; CHECK-NEXT:    movl %esi, 20(%ecx)
+; CHECK-NEXT:    movl %ebx, 16(%ecx)
+; CHECK-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; CHECK-NEXT:    movl %eax, 12(%ecx)
+; CHECK-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; CHECK-NEXT:    movl %eax, 8(%ecx)
+; CHECK-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; CHECK-NEXT:    movl %eax, 4(%ecx)
+; CHECK-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; CHECK-NEXT:    movl %eax, (%ecx)
+; CHECK-NEXT:    leal -12(%ebp), %esp
 ; CHECK-NEXT:    popl %esi
 ; CHECK-NEXT:    popl %edi
 ; CHECK-NEXT:    popl %ebx
@@ -120,42 +100,35 @@ define void @shift1(i256 %x, i256 %a, ptr nocapture %r) nounwind readnone {
 ; CHECK-X64-O0-NEXT:    movq %rcx, -{{[0-9]+}}(%rsp)
 ; CHECK-X64-O0-NEXT:    movq %rcx, -{{[0-9]+}}(%rsp)
 ; CHECK-X64-O0-NEXT:    movq %rcx, -{{[0-9]+}}(%rsp)
-; CHECK-X64-O0-NEXT:    movb %r8b, %dl
-; CHECK-X64-O0-NEXT:    movb %dl, %cl
-; CHECK-X64-O0-NEXT:    andb $7, %cl
+; CHECK-X64-O0-NEXT:    movb %r8b, %cl
 ; CHECK-X64-O0-NEXT:    movb %cl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
-; CHECK-X64-O0-NEXT:    shrb $3, %dl
+; CHECK-X64-O0-NEXT:    movb %cl, %dl
+; CHECK-X64-O0-NEXT:    shrb $6, %dl
 ; CHECK-X64-O0-NEXT:    movzbl %dl, %edx
 ; CHECK-X64-O0-NEXT:    movl %edx, %edi
-; CHECK-X64-O0-NEXT:    movq -64(%rsp,%rdi), %rdx
-; CHECK-X64-O0-NEXT:    movq -56(%rsp,%rdi), %r8
-; CHECK-X64-O0-NEXT:    movq %r8, %r9
-; CHECK-X64-O0-NEXT:    shrq %cl, %r9
-; CHECK-X64-O0-NEXT:    movb {{[-0-9]+}}(%r{{[sb]}}p), %cl # 1-byte Reload
-; CHECK-X64-O0-NEXT:    notb %cl
-; CHECK-X64-O0-NEXT:    movq -48(%rsp,%rdi), %rsi
-; CHECK-X64-O0-NEXT:    movq %rsi, %r10
-; CHECK-X64-O0-NEXT:    addq %r10, %r10
-; CHECK-X64-O0-NEXT:    shlq %cl, %r10
+; CHECK-X64-O0-NEXT:    movq -56(%rsp,%rdi,8), %rsi
+; CHECK-X64-O0-NEXT:    movq -72(%rsp,%rdi,8), %r8
+; CHECK-X64-O0-NEXT:    movq -64(%rsp,%rdi,8), %r9
+; CHECK-X64-O0-NEXT:    movq %r9, %rdx
+; CHECK-X64-O0-NEXT:    shrdq %cl, %rsi, %rdx
 ; CHECK-X64-O0-NEXT:    movb {{[-0-9]+}}(%r{{[sb]}}p), %cl # 1-byte Reload
-; CHECK-X64-O0-NEXT:    orq %r10, %r9
-; CHECK-X64-O0-NEXT:    movq %r9, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; CHECK-X64-O0-NEXT:    movq -40(%rsp,%rdi), %rdi
+; CHECK-X64-O0-NEXT:    movq -48(%rsp,%rdi,8), %rdi
 ; CHECK-X64-O0-NEXT:    shrdq %cl, %rdi, %rsi
 ; CHECK-X64-O0-NEXT:    movb {{[-0-9]+}}(%r{{[sb]}}p), %cl # 1-byte Reload
-; CHECK-X64-O0-NEXT:    shrdq %cl, %r8, %rdx
+; CHECK-X64-O0-NEXT:    shrdq %cl, %r9, %r8
 ; CHECK-X64-O0-NEXT:    movb {{[-0-9]+}}(%r{{[sb]}}p), %cl # 1-byte Reload
+; CHECK-X64-O0-NEXT:    movq %r8, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
 ; CHECK-X64-O0-NEXT:    sarq %cl, %rdi
 ; CHECK-X64-O0-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload
 ; CHECK-X64-O0-NEXT:    movq %rdi, 24(%rax)
 ; CHECK-X64-O0-NEXT:    movq %rsi, 16(%rax)
-; CHECK-X64-O0-NEXT:    movq %rdx, (%rax)
-; CHECK-X64-O0-NEXT:    movq %rcx, 8(%rax)
+; CHECK-X64-O0-NEXT:    movq %rdx, 8(%rax)
+; CHECK-X64-O0-NEXT:    movq %rcx, (%rax)
 ; CHECK-X64-O0-NEXT:    retq
 ;
 ; CHECK-X64-O2-LABEL: shift1:
 ; CHECK-X64-O2:       # %bb.0: # %entry
-; CHECK-X64-O2-NEXT:    movq {{[0-9]+}}(%rsp), %r9
+; CHECK-X64-O2-NEXT:    movq {{[0-9]+}}(%rsp), %rax
 ; CHECK-X64-O2-NEXT:    movq %rcx, -{{[0-9]+}}(%rsp)
 ; CHECK-X64-O2-NEXT:    movq %rdx, -{{[0-9]+}}(%rsp)
 ; CHECK-X64-O2-NEXT:    movq %rsi, -{{[0-9]+}}(%rsp)
@@ -165,29 +138,23 @@ define void @shift1(i256 %x, i256 %a, ptr nocapture %r) nounwind readnone {
 ; CHECK-X64-O2-NEXT:    movq %rcx, -{{[0-9]+}}(%rsp)
 ; CHECK-X64-O2-NEXT:    movq %rcx, -{{[0-9]+}}(%rsp)
 ; CHECK-X64-O2-NEXT:    movq %rcx, -{{[0-9]+}}(%rsp)
-; CHECK-X64-O2-NEXT:    movl %r8d, %eax
-; CHECK-X64-O2-NEXT:    andb $7, %al
-; CHECK-X64-O2-NEXT:    shrb $3, %r8b
-; CHECK-X64-O2-NEXT:    movzbl %r8b, %edx
-; CHECK-X64-O2-NEXT:    movq -64(%rsp,%rdx), %rsi
-; CHECK-X64-O2-NEXT:    movq -56(%rsp,%rdx), %rdi
-; CHECK-X64-O2-NEXT:    movq %rdi, %r8
-; CHECK-X64-O2-NEXT:    movl %eax, %ecx
-; CHECK-X64-O2-NEXT:    shrq %cl, %r8
-; CHECK-X64-O2-NEXT:    notb %cl
-; CHECK-X64-O2-NEXT:    movq -48(%rsp,%rdx), %r10
-; CHECK-X64-O2-NEXT:    leaq (%r10,%r10), %r11
-; CHECK-X64-O2-NEXT:    shlq %cl, %r11
-; CHECK-X64-O2-NEXT:    orq %r8, %r11
-; CHECK-X64-O2-NEXT:    movq -40(%rsp,%rdx), %rdx
-; CHECK-X64-O2-NEXT:    movl %eax, %ecx
-; CHECK-X64-O2-NEXT:    shrdq %cl, %rdx, %r10
-; CHECK-X64-O2-NEXT:    shrdq %cl, %rdi, %rsi
+; CHECK-X64-O2-NEXT:    movl %r8d, %ecx
+; CHECK-X64-O2-NEXT:    shrb $6, %cl
+; CHECK-X64-O2-NEXT:    movzbl %cl, %edx
+; CHECK-X64-O2-NEXT:    movq -56(%rsp,%rdx,8), %rsi
+; CHECK-X64-O2-NEXT:    movq -72(%rsp,%rdx,8), %rdi
+; CHECK-X64-O2-NEXT:    movq -64(%rsp,%rdx,8), %r9
+; CHECK-X64-O2-NEXT:    movq %r9, %r10
+; CHECK-X64-O2-NEXT:    movl %r8d, %ecx
+; CHECK-X64-O2-NEXT:    shrdq %cl, %rsi, %r10
+; CHECK-X64-O2-NEXT:    movq -48(%rsp,%rdx,8), %rdx
+; CHECK-X64-O2-NEXT:    shrdq %cl, %rdx, %rsi
+; CHECK-X64-O2-NEXT:    shrdq %cl, %r9, %rdi
 ; CHECK-X64-O2-NEXT:    sarq %cl, %rdx
-; CHECK-X64-O2-NEXT:    movq %rdx, 24(%r9)
-; CHECK-X64-O2-NEXT:    movq %r10, 16(%r9)
-; CHECK-X64-O2-NEXT:    movq %rsi, (%r9)
-; CHECK-X64-O2-NEXT:    movq %r11, 8(%r9)
+; CHECK-X64-O2-NEXT:    movq %rdx, 24(%rax)
+; CHECK-X64-O2-NEXT:    movq %rsi, 16(%rax)
+; CHECK-X64-O2-NEXT:    movq %r10, 8(%rax)
+; CHECK-X64-O2-NEXT:    movq %rdi, (%rax)
 ; CHECK-X64-O2-NEXT:    retq
 entry:
 	%0 = ashr i256 %x, %a
@@ -199,11 +166,13 @@ define i256 @shift2(i256 %c) nounwind
 ; CHECK-LABEL: shift2:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    pushl %ebp
+; CHECK-NEXT:    movl %esp, %ebp
 ; CHECK-NEXT:    pushl %ebx
 ; CHECK-NEXT:    pushl %edi
 ; CHECK-NEXT:    pushl %esi
-; CHECK-NEXT:    subl $92, %esp
-; CHECK-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; CHECK-NEXT:    andl $-16, %esp
+; CHECK-NEXT:    subl $112, %esp
+; CHECK-NEXT:    movl 12(%ebp), %ecx
 ; CHECK-NEXT:    movl $0, {{[0-9]+}}(%esp)
 ; CHECK-NEXT:    movl $0, {{[0-9]+}}(%esp)
 ; CHECK-NEXT:    movl $0, {{[0-9]+}}(%esp)
@@ -220,68 +189,54 @@ define i256 @shift2(i256 %c) nounwind
 ; CHECK-NEXT:    movl $0, {{[0-9]+}}(%esp)
 ; CHECK-NEXT:    movl $0, {{[0-9]+}}(%esp)
 ; CHECK-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; CHECK-NEXT:    movb %al, %ch
-; CHECK-NEXT:    andb $7, %ch
+; CHECK-NEXT:    movl %ecx, %eax
 ; CHECK-NEXT:    shrb $3, %al
+; CHECK-NEXT:    andb $28, %al
 ; CHECK-NEXT:    negb %al
 ; CHECK-NEXT:    movsbl %al, %eax
-; CHECK-NEXT:    movl 68(%esp,%eax), %edx
-; CHECK-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; CHECK-NEXT:    movb %ch, %cl
-; CHECK-NEXT:    shll %cl, %edx
-; CHECK-NEXT:    notb %cl
-; CHECK-NEXT:    movb %cl, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill
-; CHECK-NEXT:    movl 64(%esp,%eax), %ebp
-; CHECK-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; CHECK-NEXT:    shrl %ebp
-; CHECK-NEXT:    shrl %cl, %ebp
-; CHECK-NEXT:    orl %edx, %ebp
-; CHECK-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; CHECK-NEXT:    movl 76(%esp,%eax), %edx
-; CHECK-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; CHECK-NEXT:    movb %ch, %cl
-; CHECK-NEXT:    shll %cl, %edx
-; CHECK-NEXT:    movl 72(%esp,%eax), %ebx
-; CHECK-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; CHECK-NEXT:    shrl %ebx
-; CHECK-NEXT:    movb {{[-0-9]+}}(%e{{[sb]}}p), %cl # 1-byte Reload
-; CHECK-NEXT:    shrl %cl, %ebx
-; CHECK-NEXT:    orl %edx, %ebx
-; CHECK-NEXT:    movl 84(%esp,%eax), %esi
+; CHECK-NEXT:    movl 68(%esp,%eax), %esi
 ; CHECK-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; CHECK-NEXT:    movb %ch, %cl
-; CHECK-NEXT:    shll %cl, %esi
-; CHECK-NEXT:    movl 80(%esp,%eax), %edi
-; CHECK-NEXT:    movl %edi, %edx
-; CHECK-NEXT:    shrl %edx
-; CHECK-NEXT:    movb {{[-0-9]+}}(%e{{[sb]}}p), %cl # 1-byte Reload
-; CHECK-NEXT:    shrl %cl, %edx
-; CHECK-NEXT:    orl %esi, %edx
-; CHECK-NEXT:    movb %ch, %cl
-; CHECK-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; CHECK-NEXT:    shldl %cl, %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; CHECK-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; CHECK-NEXT:    movl 72(%esp,%eax), %edx
+; CHECK-NEXT:    movl %edx, %edi
 ; CHECK-NEXT:    shldl %cl, %esi, %edi
-; CHECK-NEXT:    movl 60(%esp,%eax), %ebp
-; CHECK-NEXT:    movl 88(%esp,%eax), %esi
-; CHECK-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; CHECK-NEXT:    shldl %cl, %eax, %esi
-; CHECK-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; CHECK-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; CHECK-NEXT:    movl 76(%esp,%eax), %esi
+; CHECK-NEXT:    movl %esi, %edi
+; CHECK-NEXT:    shldl %cl, %edx, %edi
+; CHECK-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; CHECK-NEXT:    movl 80(%esp,%eax), %edx
+; CHECK-NEXT:    movl %edx, %edi
+; CHECK-NEXT:    shldl %cl, %esi, %edi
+; CHECK-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; CHECK-NEXT:    movl 84(%esp,%eax), %esi
+; CHECK-NEXT:    movl %esi, %ebx
+; CHECK-NEXT:    shldl %cl, %edx, %ebx
+; CHECK-NEXT:    movl 88(%esp,%eax), %edi
+; CHECK-NEXT:    movl %edi, %edx
+; CHECK-NEXT:    shldl %cl, %esi, %edx
+; CHECK-NEXT:    movl 64(%esp,%eax), %esi
+; CHECK-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; CHECK-NEXT:    movl 92(%esp,%eax), %esi
+; CHECK-NEXT:    shldl %cl, %edi, %esi
+; CHECK-NEXT:    movl 8(%ebp), %eax
 ; CHECK-NEXT:    movl %esi, 28(%eax)
-; CHECK-NEXT:    movl %edi, 20(%eax)
-; CHECK-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; CHECK-NEXT:    movl %esi, 12(%eax)
-; CHECK-NEXT:    movl %ebp, %esi
-; CHECK-NEXT:    shll %cl, %esi
-; CHECK-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; CHECK-NEXT:    shldl %cl, %ebp, %edi
-; CHECK-NEXT:    movl %edi, 4(%eax)
-; CHECK-NEXT:    movl %esi, (%eax)
 ; CHECK-NEXT:    movl %edx, 24(%eax)
-; CHECK-NEXT:    movl %ebx, 16(%eax)
-; CHECK-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; CHECK-NEXT:    movl %ecx, 8(%eax)
-; CHECK-NEXT:    addl $92, %esp
+; CHECK-NEXT:    movl %ebx, 20(%eax)
+; CHECK-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; CHECK-NEXT:    movl %edx, 16(%eax)
+; CHECK-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; CHECK-NEXT:    movl %edx, 12(%eax)
+; CHECK-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; CHECK-NEXT:    movl %edx, 8(%eax)
+; CHECK-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; CHECK-NEXT:    movl %edi, %edx
+; CHECK-NEXT:    shll %cl, %edx
+; CHECK-NEXT:    # kill: def $cl killed $cl killed $ecx
+; CHECK-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; CHECK-NEXT:    shldl %cl, %edi, %esi
+; CHECK-NEXT:    movl %esi, 4(%eax)
+; CHECK-NEXT:    movl %edx, (%eax)
+; CHECK-NEXT:    leal -12(%ebp), %esp
 ; CHECK-NEXT:    popl %esi
 ; CHECK-NEXT:    popl %edi
 ; CHECK-NEXT:    popl %ebx
@@ -299,77 +254,64 @@ define i256 @shift2(i256 %c) nounwind
 ; CHECK-X64-O0-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
 ; CHECK-X64-O0-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
 ; CHECK-X64-O0-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
-; CHECK-X64-O0-NEXT:    movb %sil, %dl
-; CHECK-X64-O0-NEXT:    movb %dl, %cl
-; CHECK-X64-O0-NEXT:    andb $7, %cl
+; CHECK-X64-O0-NEXT:    movb %sil, %cl
 ; CHECK-X64-O0-NEXT:    movb %cl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
+; CHECK-X64-O0-NEXT:    movb %cl, %dl
 ; CHECK-X64-O0-NEXT:    shrb $3, %dl
+; CHECK-X64-O0-NEXT:    andb $24, %dl
 ; CHECK-X64-O0-NEXT:    negb %dl
-; CHECK-X64-O0-NEXT:    movsbq %dl, %rdx
-; CHECK-X64-O0-NEXT:    movq -16(%rsp,%rdx), %rsi
-; CHECK-X64-O0-NEXT:    movq %rsi, %r10
-; CHECK-X64-O0-NEXT:    shlq %cl, %r10
+; CHECK-X64-O0-NEXT:    movsbq %dl, %r8
+; CHECK-X64-O0-NEXT:    movq -40(%rsp,%r8), %r9
+; CHECK-X64-O0-NEXT:    movq -32(%rsp,%r8), %rdx
+; CHECK-X64-O0-NEXT:    movq -24(%rsp,%r8), %r10
+; CHECK-X64-O0-NEXT:    movq %r10, %rsi
+; CHECK-X64-O0-NEXT:    shldq %cl, %rdx, %rsi
 ; CHECK-X64-O0-NEXT:    movb {{[-0-9]+}}(%r{{[sb]}}p), %cl # 1-byte Reload
-; CHECK-X64-O0-NEXT:    notb %cl
-; CHECK-X64-O0-NEXT:    movq -32(%rsp,%rdx), %r9
-; CHECK-X64-O0-NEXT:    movq -24(%rsp,%rdx), %r8
-; CHECK-X64-O0-NEXT:    movq %r8, %r11
-; CHECK-X64-O0-NEXT:    shrq %r11
-; CHECK-X64-O0-NEXT:    shrq %cl, %r11
-; CHECK-X64-O0-NEXT:    movb {{[-0-9]+}}(%r{{[sb]}}p), %cl # 1-byte Reload
-; CHECK-X64-O0-NEXT:    orq %r11, %r10
-; CHECK-X64-O0-NEXT:    movq %r10, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; CHECK-X64-O0-NEXT:    movq -8(%rsp,%rdx), %rdx
-; CHECK-X64-O0-NEXT:    shldq %cl, %rsi, %rdx
+; CHECK-X64-O0-NEXT:    movq -16(%rsp,%r8), %r8
+; CHECK-X64-O0-NEXT:    shldq %cl, %r10, %r8
 ; CHECK-X64-O0-NEXT:    movb {{[-0-9]+}}(%r{{[sb]}}p), %cl # 1-byte Reload
-; CHECK-X64-O0-NEXT:    movq %r9, %rsi
-; CHECK-X64-O0-NEXT:    shlq %cl, %rsi
+; CHECK-X64-O0-NEXT:    movq %r9, %r10
+; CHECK-X64-O0-NEXT:    shlq %cl, %r10
 ; CHECK-X64-O0-NEXT:    movb {{[-0-9]+}}(%r{{[sb]}}p), %cl # 1-byte Reload
-; CHECK-X64-O0-NEXT:    shldq %cl, %r9, %r8
+; CHECK-X64-O0-NEXT:    movq %r10, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; CHECK-X64-O0-NEXT:    shldq %cl, %r9, %rdx
 ; CHECK-X64-O0-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload
-; CHECK-X64-O0-NEXT:    movq %r8, 8(%rdi)
-; CHECK-X64-O0-NEXT:    movq %rsi, (%rdi)
-; CHECK-X64-O0-NEXT:    movq %rdx, 24(%rdi)
-; CHECK-X64-O0-NEXT:    movq %rcx, 16(%rdi)
+; CHECK-X64-O0-NEXT:    movq %r8, 24(%rdi)
+; CHECK-X64-O0-NEXT:    movq %rsi, 16(%rdi)
+; CHECK-X64-O0-NEXT:    movq %rdx, 8(%rdi)
+; CHECK-X64-O0-NEXT:    movq %rcx, (%rdi)
 ; CHECK-X64-O0-NEXT:    retq
 ;
 ; CHECK-X64-O2-LABEL: shift2:
 ; CHECK-X64-O2:       # %bb.0:
+; CHECK-X64-O2-NEXT:    movq %rsi, %rcx
 ; CHECK-X64-O2-NEXT:    movq %rdi, %rax
-; CHECK-X64-O2-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
-; CHECK-X64-O2-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
+; CHECK-X64-O2-NEXT:    xorps %xmm0, %xmm0
+; CHECK-X64-O2-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
+; CHECK-X64-O2-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
+; CHECK-X64-O2-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
 ; CHECK-X64-O2-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
 ; CHECK-X64-O2-NEXT:    movq $1, -{{[0-9]+}}(%rsp)
-; CHECK-X64-O2-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
-; CHECK-X64-O2-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
-; CHECK-X64-O2-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
-; CHECK-X64-O2-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
-; CHECK-X64-O2-NEXT:    movl %esi, %edx
-; CHECK-X64-O2-NEXT:    andb $7, %dl
-; CHECK-X64-O2-NEXT:    shrb $3, %sil
-; CHECK-X64-O2-NEXT:    negb %sil
-; CHECK-X64-O2-NEXT:    movsbq %sil, %rsi
-; CHECK-X64-O2-NEXT:    movq -16(%rsp,%rsi), %rdi
-; CHECK-X64-O2-NEXT:    movq %rdi, %r8
-; CHECK-X64-O2-NEXT:    movl %edx, %ecx
+; CHECK-X64-O2-NEXT:    movl %ecx, %edx
+; CHECK-X64-O2-NEXT:    shrb $3, %dl
+; CHECK-X64-O2-NEXT:    andb $24, %dl
+; CHECK-X64-O2-NEXT:    negb %dl
+; CHECK-X64-O2-NEXT:    movsbq %dl, %rdx
+; CHECK-X64-O2-NEXT:    movq -40(%rsp,%rdx), %rsi
+; CHECK-X64-O2-NEXT:    movq -32(%rsp,%rdx), %rdi
+; CHECK-X64-O2-NEXT:    movq -24(%rsp,%rdx), %r8
+; CHECK-X64-O2-NEXT:    movq %r8, %r9
+; CHECK-X64-O2-NEXT:    shldq %cl, %rdi, %r9
+; CHECK-X64-O2-NEXT:    movq -16(%rsp,%rdx), %rdx
+; CHECK-X64-O2-NEXT:    shldq %cl, %r8, %rdx
+; CHECK-X64-O2-NEXT:    movq %rsi, %r8
 ; CHECK-X64-O2-NEXT:    shlq %cl, %r8
-; CHECK-X64-O2-NEXT:    notb %cl
-; CHECK-X64-O2-NEXT:    movq -32(%rsp,%rsi), %r9
-; CHECK-X64-O2-NEXT:    movq -24(%rsp,%rsi), %r10
-; CHECK-X64-O2-NEXT:    movq %r10, %r11
-; CHECK-X64-O2-NEXT:    shrq %r11
-; CHECK-X64-O2-NEXT:    shrq %cl, %r11
-; CHECK-X64-O2-NEXT:    orq %r8, %r11
-; CHECK-X64-O2-NEXT:    movq -8(%rsp,%rsi), %rsi
-; CHECK-X64-O2-NEXT:    movl %edx, %ecx
-; CHECK-X64-O2-NEXT:    shldq %cl, %rdi, %rsi
-; CHECK-X64-O2-NEXT:    movq %r9, %rdi
-; CHECK-X64-O2-NEXT:    shlq %cl, %rdi
-; CHECK-X64-O2-NEXT:    shldq %cl, %r9, %r10
-; CHECK-X64-O2-NEXT:    movq %rsi, 24(%rax)
-; CHECK-X64-O2-NEXT:    movq %r10, 8(%rax)
-; CHECK-X64-O2-NEXT:    movq %rdi, (%rax)
-; CHECK-X64-O2-NEXT:    movq %r11, 16(%rax)
+; CHECK-X64-O2-NEXT:    # kill: def $cl killed $cl killed $rcx
+; CHECK-X64-O2-NEXT:    shldq %cl, %rsi, %rdi
+; CHECK-X64-O2-NEXT:    movq %rdx, 24(%rax)
+; CHECK-X64-O2-NEXT:    movq %r9, 16(%rax)
+; CHECK-X64-O2-NEXT:    movq %rdi, 8(%rax)
+; CHECK-X64-O2-NEXT:    movq %r8, (%rax)
 ; CHECK-X64-O2-NEXT:    retq
 {
   %b = shl i256 1, %c  ; %c must not be a constant

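For the left-shift direction exercised by `shift2` above, the value is stored at the top of the slot with zero fill below it, and the word offset is negated before indexing — hence the `negb` in the CHECK lines. (The test keeps `%c` non-constant, per its comment, presumably because a constant amount would be expanded directly without the stack slot.) A companion sketch using 64-bit words as in the CHECK-X64 runs; again, the names are illustrative only:

#include <stdint.h>
#include <string.h>

// Shift a 256-bit value (four 64-bit words) left by a variable amount.
static void shl_256_via_stack(const uint64_t src[4], unsigned amt,
                              uint64_t dst[4]) {
  // The value sits in the high half of the slot with zero fill below,
  // so the word offset is subtracted rather than added.
  _Alignas(32) uint64_t slot[8] = {0};
  memcpy(slot + 4, src, 32);

  unsigned word = (amt / 8) & ~7u; // byte offset in 8-byte units
                                   // (cf. the `andb $24` above)
  unsigned bits = amt & 63;        // residual in-word shift
  const uint64_t *p = slot + 4 - word / 8; // aligned word loads
  for (int i = 3; i >= 0; --i)
    dst[i] = bits ? (p[i] << bits) | (p[i - 1] >> (64 - bits))
                  : p[i]; // one-step case when amt % 64 == 0
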
diff --git a/llvm/test/CodeGen/X86/wide-scalar-shift-by-byte-multiple-legalization.ll b/llvm/test/CodeGen/X86/wide-scalar-shift-by-byte-multiple-legalization.ll
index e5affd86312efd..277525796824bd 100644
--- a/llvm/test/CodeGen/X86/wide-scalar-shift-by-byte-multiple-legalization.ll
+++ b/llvm/test/CodeGen/X86/wide-scalar-shift-by-byte-multiple-legalization.ll
@@ -646,7 +646,869 @@ define void @lshr_16bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ; X64-HAVE-SHLD-HAVE-BMI2-NEXT:    movq %rax, (%rdx)
 ; X64-HAVE-SHLD-HAVE-BMI2-NEXT:    retq
 ;
-; X86-SSE2-LABEL: lshr_16bytes:
+; FALLBACK16-LABEL: lshr_16bytes:
+; FALLBACK16:       # %bb.0:
+; FALLBACK16-NEXT:    pushl %ebp
+; FALLBACK16-NEXT:    pushl %ebx
+; FALLBACK16-NEXT:    pushl %edi
+; FALLBACK16-NEXT:    pushl %esi
+; FALLBACK16-NEXT:    subl $60, %esp
+; FALLBACK16-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; FALLBACK16-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; FALLBACK16-NEXT:    movl (%ecx), %edx
+; FALLBACK16-NEXT:    movl 4(%ecx), %esi
+; FALLBACK16-NEXT:    movl 8(%ecx), %edi
+; FALLBACK16-NEXT:    movl 12(%ecx), %ecx
+; FALLBACK16-NEXT:    movb (%eax), %ah
+; FALLBACK16-NEXT:    movb %ah, %al
+; FALLBACK16-NEXT:    shlb $3, %al
+; FALLBACK16-NEXT:    xorps %xmm0, %xmm0
+; FALLBACK16-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
+; FALLBACK16-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; FALLBACK16-NEXT:    movl %edi, {{[0-9]+}}(%esp)
+; FALLBACK16-NEXT:    movl %esi, {{[0-9]+}}(%esp)
+; FALLBACK16-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; FALLBACK16-NEXT:    andb $12, %ah
+; FALLBACK16-NEXT:    movzbl %ah, %ebp
+; FALLBACK16-NEXT:    movl 20(%esp,%ebp), %esi
+; FALLBACK16-NEXT:    movl %esi, %ebx
+; FALLBACK16-NEXT:    movl %eax, %ecx
+; FALLBACK16-NEXT:    shrl %cl, %ebx
+; FALLBACK16-NEXT:    movl %eax, %edx
+; FALLBACK16-NEXT:    notb %dl
+; FALLBACK16-NEXT:    movl 24(%esp,%ebp), %ecx
+; FALLBACK16-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK16-NEXT:    leal (%ecx,%ecx), %edi
+; FALLBACK16-NEXT:    movl %edx, %ecx
+; FALLBACK16-NEXT:    shll %cl, %edi
+; FALLBACK16-NEXT:    orl %ebx, %edi
+; FALLBACK16-NEXT:    movl 16(%esp,%ebp), %ebx
+; FALLBACK16-NEXT:    movl %eax, %ecx
+; FALLBACK16-NEXT:    shrl %cl, %ebx
+; FALLBACK16-NEXT:    addl %esi, %esi
+; FALLBACK16-NEXT:    movl %edx, %ecx
+; FALLBACK16-NEXT:    shll %cl, %esi
+; FALLBACK16-NEXT:    orl %ebx, %esi
+; FALLBACK16-NEXT:    movl %eax, %ecx
+; FALLBACK16-NEXT:    shrl %cl, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; FALLBACK16-NEXT:    movl 28(%esp,%ebp), %ebx
+; FALLBACK16-NEXT:    leal (%ebx,%ebx), %ebp
+; FALLBACK16-NEXT:    movl %edx, %ecx
+; FALLBACK16-NEXT:    shll %cl, %ebp
+; FALLBACK16-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
+; FALLBACK16-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; FALLBACK16-NEXT:    movl %eax, %ecx
+; FALLBACK16-NEXT:    shrl %cl, %ebx
+; FALLBACK16-NEXT:    movl %ebx, 12(%edx)
+; FALLBACK16-NEXT:    movl %ebp, 8(%edx)
+; FALLBACK16-NEXT:    movl %esi, (%edx)
+; FALLBACK16-NEXT:    movl %edi, 4(%edx)
+; FALLBACK16-NEXT:    addl $60, %esp
+; FALLBACK16-NEXT:    popl %esi
+; FALLBACK16-NEXT:    popl %edi
+; FALLBACK16-NEXT:    popl %ebx
+; FALLBACK16-NEXT:    popl %ebp
+; FALLBACK16-NEXT:    retl
+;
+; FALLBACK17-LABEL: lshr_16bytes:
+; FALLBACK17:       # %bb.0:
+; FALLBACK17-NEXT:    pushl %ebp
+; FALLBACK17-NEXT:    pushl %ebx
+; FALLBACK17-NEXT:    pushl %edi
+; FALLBACK17-NEXT:    pushl %esi
+; FALLBACK17-NEXT:    subl $44, %esp
+; FALLBACK17-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; FALLBACK17-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; FALLBACK17-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; FALLBACK17-NEXT:    movl (%edx), %esi
+; FALLBACK17-NEXT:    movl 4(%edx), %edi
+; FALLBACK17-NEXT:    movl 8(%edx), %ebx
+; FALLBACK17-NEXT:    movl 12(%edx), %edx
+; FALLBACK17-NEXT:    movb (%ecx), %ch
+; FALLBACK17-NEXT:    movb %ch, %cl
+; FALLBACK17-NEXT:    shlb $3, %cl
+; FALLBACK17-NEXT:    xorps %xmm0, %xmm0
+; FALLBACK17-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
+; FALLBACK17-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; FALLBACK17-NEXT:    movl %ebx, {{[0-9]+}}(%esp)
+; FALLBACK17-NEXT:    movl %edi, {{[0-9]+}}(%esp)
+; FALLBACK17-NEXT:    movl %esi, (%esp)
+; FALLBACK17-NEXT:    andb $12, %ch
+; FALLBACK17-NEXT:    movzbl %ch, %ebx
+; FALLBACK17-NEXT:    movl 8(%esp,%ebx), %esi
+; FALLBACK17-NEXT:    movl (%esp,%ebx), %edx
+; FALLBACK17-NEXT:    movl 4(%esp,%ebx), %ebp
+; FALLBACK17-NEXT:    movl %ebp, %edi
+; FALLBACK17-NEXT:    shrdl %cl, %esi, %edi
+; FALLBACK17-NEXT:    movl 12(%esp,%ebx), %ebx
+; FALLBACK17-NEXT:    shrdl %cl, %ebx, %esi
+; FALLBACK17-NEXT:    shrdl %cl, %ebp, %edx
+; FALLBACK17-NEXT:    shrl %cl, %ebx
+; FALLBACK17-NEXT:    movl %esi, 8(%eax)
+; FALLBACK17-NEXT:    movl %ebx, 12(%eax)
+; FALLBACK17-NEXT:    movl %edx, (%eax)
+; FALLBACK17-NEXT:    movl %edi, 4(%eax)
+; FALLBACK17-NEXT:    addl $44, %esp
+; FALLBACK17-NEXT:    popl %esi
+; FALLBACK17-NEXT:    popl %edi
+; FALLBACK17-NEXT:    popl %ebx
+; FALLBACK17-NEXT:    popl %ebp
+; FALLBACK17-NEXT:    retl
+;
+; FALLBACK18-LABEL: lshr_16bytes:
+; FALLBACK18:       # %bb.0:
+; FALLBACK18-NEXT:    pushl %ebp
+; FALLBACK18-NEXT:    pushl %ebx
+; FALLBACK18-NEXT:    pushl %edi
+; FALLBACK18-NEXT:    pushl %esi
+; FALLBACK18-NEXT:    subl $44, %esp
+; FALLBACK18-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; FALLBACK18-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; FALLBACK18-NEXT:    movl (%ecx), %edx
+; FALLBACK18-NEXT:    movl 4(%ecx), %esi
+; FALLBACK18-NEXT:    movl 8(%ecx), %edi
+; FALLBACK18-NEXT:    movl 12(%ecx), %ecx
+; FALLBACK18-NEXT:    movzbl (%eax), %ebx
+; FALLBACK18-NEXT:    movl %ebx, %eax
+; FALLBACK18-NEXT:    shlb $3, %al
+; FALLBACK18-NEXT:    xorps %xmm0, %xmm0
+; FALLBACK18-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
+; FALLBACK18-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; FALLBACK18-NEXT:    movl %edi, {{[0-9]+}}(%esp)
+; FALLBACK18-NEXT:    movl %esi, {{[0-9]+}}(%esp)
+; FALLBACK18-NEXT:    movl %edx, (%esp)
+; FALLBACK18-NEXT:    andb $12, %bl
+; FALLBACK18-NEXT:    movzbl %bl, %esi
+; FALLBACK18-NEXT:    movl 4(%esp,%esi), %edi
+; FALLBACK18-NEXT:    movl 8(%esp,%esi), %ebx
+; FALLBACK18-NEXT:    shrxl %eax, %edi, %ebp
+; FALLBACK18-NEXT:    movl %eax, %edx
+; FALLBACK18-NEXT:    notb %dl
+; FALLBACK18-NEXT:    leal (%ebx,%ebx), %ecx
+; FALLBACK18-NEXT:    shlxl %edx, %ecx, %ecx
+; FALLBACK18-NEXT:    orl %ebp, %ecx
+; FALLBACK18-NEXT:    shrxl %eax, (%esp,%esi), %ebp
+; FALLBACK18-NEXT:    addl %edi, %edi
+; FALLBACK18-NEXT:    shlxl %edx, %edi, %edi
+; FALLBACK18-NEXT:    orl %ebp, %edi
+; FALLBACK18-NEXT:    shrxl %eax, %ebx, %ebx
+; FALLBACK18-NEXT:    movl 12(%esp,%esi), %esi
+; FALLBACK18-NEXT:    shrxl %eax, %esi, %eax
+; FALLBACK18-NEXT:    addl %esi, %esi
+; FALLBACK18-NEXT:    shlxl %edx, %esi, %edx
+; FALLBACK18-NEXT:    orl %ebx, %edx
+; FALLBACK18-NEXT:    movl {{[0-9]+}}(%esp), %esi
+; FALLBACK18-NEXT:    movl %eax, 12(%esi)
+; FALLBACK18-NEXT:    movl %edx, 8(%esi)
+; FALLBACK18-NEXT:    movl %edi, (%esi)
+; FALLBACK18-NEXT:    movl %ecx, 4(%esi)
+; FALLBACK18-NEXT:    addl $44, %esp
+; FALLBACK18-NEXT:    popl %esi
+; FALLBACK18-NEXT:    popl %edi
+; FALLBACK18-NEXT:    popl %ebx
+; FALLBACK18-NEXT:    popl %ebp
+; FALLBACK18-NEXT:    retl
+;
+; FALLBACK19-LABEL: lshr_16bytes:
+; FALLBACK19:       # %bb.0:
+; FALLBACK19-NEXT:    pushl %ebp
+; FALLBACK19-NEXT:    pushl %ebx
+; FALLBACK19-NEXT:    pushl %edi
+; FALLBACK19-NEXT:    pushl %esi
+; FALLBACK19-NEXT:    subl $44, %esp
+; FALLBACK19-NEXT:    movl {{[0-9]+}}(%esp), %ebp
+; FALLBACK19-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; FALLBACK19-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; FALLBACK19-NEXT:    movl (%edx), %esi
+; FALLBACK19-NEXT:    movl 4(%edx), %edi
+; FALLBACK19-NEXT:    movl 8(%edx), %ebx
+; FALLBACK19-NEXT:    movl 12(%edx), %edx
+; FALLBACK19-NEXT:    movzbl (%ecx), %eax
+; FALLBACK19-NEXT:    movl %eax, %ecx
+; FALLBACK19-NEXT:    shlb $3, %cl
+; FALLBACK19-NEXT:    xorps %xmm0, %xmm0
+; FALLBACK19-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
+; FALLBACK19-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; FALLBACK19-NEXT:    movl %ebx, {{[0-9]+}}(%esp)
+; FALLBACK19-NEXT:    movl %edi, {{[0-9]+}}(%esp)
+; FALLBACK19-NEXT:    movl %esi, (%esp)
+; FALLBACK19-NEXT:    andb $12, %al
+; FALLBACK19-NEXT:    movzbl %al, %eax
+; FALLBACK19-NEXT:    movl 8(%esp,%eax), %ebx
+; FALLBACK19-NEXT:    movl (%esp,%eax), %edx
+; FALLBACK19-NEXT:    movl 4(%esp,%eax), %esi
+; FALLBACK19-NEXT:    movl %esi, %edi
+; FALLBACK19-NEXT:    shrdl %cl, %ebx, %edi
+; FALLBACK19-NEXT:    movl 12(%esp,%eax), %eax
+; FALLBACK19-NEXT:    shrdl %cl, %eax, %ebx
+; FALLBACK19-NEXT:    movl %ebx, 8(%ebp)
+; FALLBACK19-NEXT:    shrxl %ecx, %eax, %eax
+; FALLBACK19-NEXT:    movl %eax, 12(%ebp)
+; FALLBACK19-NEXT:    # kill: def $cl killed $cl killed $ecx
+; FALLBACK19-NEXT:    shrdl %cl, %esi, %edx
+; FALLBACK19-NEXT:    movl %edx, (%ebp)
+; FALLBACK19-NEXT:    movl %edi, 4(%ebp)
+; FALLBACK19-NEXT:    addl $44, %esp
+; FALLBACK19-NEXT:    popl %esi
+; FALLBACK19-NEXT:    popl %edi
+; FALLBACK19-NEXT:    popl %ebx
+; FALLBACK19-NEXT:    popl %ebp
+; FALLBACK19-NEXT:    retl
+;
+; FALLBACK20-LABEL: lshr_16bytes:
+; FALLBACK20:       # %bb.0:
+; FALLBACK20-NEXT:    pushl %ebp
+; FALLBACK20-NEXT:    pushl %ebx
+; FALLBACK20-NEXT:    pushl %edi
+; FALLBACK20-NEXT:    pushl %esi
+; FALLBACK20-NEXT:    subl $60, %esp
+; FALLBACK20-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; FALLBACK20-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; FALLBACK20-NEXT:    movups (%ecx), %xmm0
+; FALLBACK20-NEXT:    movzbl (%eax), %ecx
+; FALLBACK20-NEXT:    movl %ecx, %eax
+; FALLBACK20-NEXT:    shlb $3, %al
+; FALLBACK20-NEXT:    xorps %xmm1, %xmm1
+; FALLBACK20-NEXT:    movaps %xmm1, {{[0-9]+}}(%esp)
+; FALLBACK20-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
+; FALLBACK20-NEXT:    andb $12, %cl
+; FALLBACK20-NEXT:    movzbl %cl, %edi
+; FALLBACK20-NEXT:    movl 16(%esp,%edi), %ebx
+; FALLBACK20-NEXT:    movl 20(%esp,%edi), %esi
+; FALLBACK20-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK20-NEXT:    movl %eax, %ecx
+; FALLBACK20-NEXT:    shrl %cl, %ebx
+; FALLBACK20-NEXT:    movl %eax, %edx
+; FALLBACK20-NEXT:    notb %dl
+; FALLBACK20-NEXT:    addl %esi, %esi
+; FALLBACK20-NEXT:    movl %edx, %ecx
+; FALLBACK20-NEXT:    shll %cl, %esi
+; FALLBACK20-NEXT:    orl %ebx, %esi
+; FALLBACK20-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK20-NEXT:    movl 24(%esp,%edi), %ebx
+; FALLBACK20-NEXT:    movl %ebx, %esi
+; FALLBACK20-NEXT:    movl %eax, %ecx
+; FALLBACK20-NEXT:    shrl %cl, %esi
+; FALLBACK20-NEXT:    movl 28(%esp,%edi), %edi
+; FALLBACK20-NEXT:    leal (%edi,%edi), %ebp
+; FALLBACK20-NEXT:    movl %edx, %ecx
+; FALLBACK20-NEXT:    shll %cl, %ebp
+; FALLBACK20-NEXT:    orl %esi, %ebp
+; FALLBACK20-NEXT:    movl %eax, %ecx
+; FALLBACK20-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; FALLBACK20-NEXT:    shrl %cl, %esi
+; FALLBACK20-NEXT:    addl %ebx, %ebx
+; FALLBACK20-NEXT:    movl %edx, %ecx
+; FALLBACK20-NEXT:    shll %cl, %ebx
+; FALLBACK20-NEXT:    orl %esi, %ebx
+; FALLBACK20-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; FALLBACK20-NEXT:    movl %eax, %ecx
+; FALLBACK20-NEXT:    shrl %cl, %edi
+; FALLBACK20-NEXT:    movl %edi, 12(%edx)
+; FALLBACK20-NEXT:    movl %ebx, 4(%edx)
+; FALLBACK20-NEXT:    movl %ebp, 8(%edx)
+; FALLBACK20-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK20-NEXT:    movl %eax, (%edx)
+; FALLBACK20-NEXT:    addl $60, %esp
+; FALLBACK20-NEXT:    popl %esi
+; FALLBACK20-NEXT:    popl %edi
+; FALLBACK20-NEXT:    popl %ebx
+; FALLBACK20-NEXT:    popl %ebp
+; FALLBACK20-NEXT:    retl
+;
+; FALLBACK21-LABEL: lshr_16bytes:
+; FALLBACK21:       # %bb.0:
+; FALLBACK21-NEXT:    pushl %ebp
+; FALLBACK21-NEXT:    pushl %ebx
+; FALLBACK21-NEXT:    pushl %edi
+; FALLBACK21-NEXT:    pushl %esi
+; FALLBACK21-NEXT:    subl $44, %esp
+; FALLBACK21-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; FALLBACK21-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; FALLBACK21-NEXT:    movups (%edx), %xmm0
+; FALLBACK21-NEXT:    movzbl (%ecx), %edx
+; FALLBACK21-NEXT:    movl %edx, %ecx
+; FALLBACK21-NEXT:    shlb $3, %cl
+; FALLBACK21-NEXT:    xorps %xmm1, %xmm1
+; FALLBACK21-NEXT:    movaps %xmm1, {{[0-9]+}}(%esp)
+; FALLBACK21-NEXT:    movaps %xmm0, (%esp)
+; FALLBACK21-NEXT:    andb $12, %dl
+; FALLBACK21-NEXT:    movzbl %dl, %ebx
+; FALLBACK21-NEXT:    movl 12(%esp,%ebx), %edx
+; FALLBACK21-NEXT:    movl 8(%esp,%ebx), %ebp
+; FALLBACK21-NEXT:    movl %ebp, %edi
+; FALLBACK21-NEXT:    shrdl %cl, %edx, %edi
+; FALLBACK21-NEXT:    movl (%esp,%ebx), %esi
+; FALLBACK21-NEXT:    movl 4(%esp,%ebx), %eax
+; FALLBACK21-NEXT:    movl %eax, %ebx
+; FALLBACK21-NEXT:    shrdl %cl, %ebp, %ebx
+; FALLBACK21-NEXT:    movl {{[0-9]+}}(%esp), %ebp
+; FALLBACK21-NEXT:    movl %ebx, 4(%ebp)
+; FALLBACK21-NEXT:    movl %edi, 8(%ebp)
+; FALLBACK21-NEXT:    shrdl %cl, %eax, %esi
+; FALLBACK21-NEXT:    shrl %cl, %edx
+; FALLBACK21-NEXT:    movl %edx, 12(%ebp)
+; FALLBACK21-NEXT:    movl %esi, (%ebp)
+; FALLBACK21-NEXT:    addl $44, %esp
+; FALLBACK21-NEXT:    popl %esi
+; FALLBACK21-NEXT:    popl %edi
+; FALLBACK21-NEXT:    popl %ebx
+; FALLBACK21-NEXT:    popl %ebp
+; FALLBACK21-NEXT:    retl
+;
+; FALLBACK22-LABEL: lshr_16bytes:
+; FALLBACK22:       # %bb.0:
+; FALLBACK22-NEXT:    pushl %ebp
+; FALLBACK22-NEXT:    pushl %ebx
+; FALLBACK22-NEXT:    pushl %edi
+; FALLBACK22-NEXT:    pushl %esi
+; FALLBACK22-NEXT:    subl $44, %esp
+; FALLBACK22-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; FALLBACK22-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; FALLBACK22-NEXT:    movups (%ecx), %xmm0
+; FALLBACK22-NEXT:    movzbl (%eax), %ecx
+; FALLBACK22-NEXT:    movl %ecx, %eax
+; FALLBACK22-NEXT:    shlb $3, %al
+; FALLBACK22-NEXT:    xorps %xmm1, %xmm1
+; FALLBACK22-NEXT:    movaps %xmm1, {{[0-9]+}}(%esp)
+; FALLBACK22-NEXT:    movaps %xmm0, (%esp)
+; FALLBACK22-NEXT:    andb $12, %cl
+; FALLBACK22-NEXT:    movzbl %cl, %edi
+; FALLBACK22-NEXT:    shrxl %eax, (%esp,%edi), %ebx
+; FALLBACK22-NEXT:    movl %eax, %ecx
+; FALLBACK22-NEXT:    notb %cl
+; FALLBACK22-NEXT:    movl 4(%esp,%edi), %ebp
+; FALLBACK22-NEXT:    movl 8(%esp,%edi), %esi
+; FALLBACK22-NEXT:    leal (%ebp,%ebp), %edx
+; FALLBACK22-NEXT:    shlxl %ecx, %edx, %edx
+; FALLBACK22-NEXT:    orl %ebx, %edx
+; FALLBACK22-NEXT:    shrxl %eax, %esi, %ebx
+; FALLBACK22-NEXT:    shrxl %eax, %ebp, %ebp
+; FALLBACK22-NEXT:    movl 12(%esp,%edi), %edi
+; FALLBACK22-NEXT:    shrxl %eax, %edi, %eax
+; FALLBACK22-NEXT:    addl %edi, %edi
+; FALLBACK22-NEXT:    shlxl %ecx, %edi, %edi
+; FALLBACK22-NEXT:    orl %ebx, %edi
+; FALLBACK22-NEXT:    addl %esi, %esi
+; FALLBACK22-NEXT:    shlxl %ecx, %esi, %ecx
+; FALLBACK22-NEXT:    orl %ebp, %ecx
+; FALLBACK22-NEXT:    movl {{[0-9]+}}(%esp), %esi
+; FALLBACK22-NEXT:    movl %eax, 12(%esi)
+; FALLBACK22-NEXT:    movl %ecx, 4(%esi)
+; FALLBACK22-NEXT:    movl %edi, 8(%esi)
+; FALLBACK22-NEXT:    movl %edx, (%esi)
+; FALLBACK22-NEXT:    addl $44, %esp
+; FALLBACK22-NEXT:    popl %esi
+; FALLBACK22-NEXT:    popl %edi
+; FALLBACK22-NEXT:    popl %ebx
+; FALLBACK22-NEXT:    popl %ebp
+; FALLBACK22-NEXT:    retl
+;
+; FALLBACK23-LABEL: lshr_16bytes:
+; FALLBACK23:       # %bb.0:
+; FALLBACK23-NEXT:    pushl %ebp
+; FALLBACK23-NEXT:    pushl %ebx
+; FALLBACK23-NEXT:    pushl %edi
+; FALLBACK23-NEXT:    pushl %esi
+; FALLBACK23-NEXT:    subl $44, %esp
+; FALLBACK23-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; FALLBACK23-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; FALLBACK23-NEXT:    movups (%edx), %xmm0
+; FALLBACK23-NEXT:    movzbl (%ecx), %edx
+; FALLBACK23-NEXT:    movl %edx, %ecx
+; FALLBACK23-NEXT:    shlb $3, %cl
+; FALLBACK23-NEXT:    xorps %xmm1, %xmm1
+; FALLBACK23-NEXT:    movaps %xmm1, {{[0-9]+}}(%esp)
+; FALLBACK23-NEXT:    movaps %xmm0, (%esp)
+; FALLBACK23-NEXT:    andb $12, %dl
+; FALLBACK23-NEXT:    movzbl %dl, %ebx
+; FALLBACK23-NEXT:    movl 12(%esp,%ebx), %edx
+; FALLBACK23-NEXT:    movl 8(%esp,%ebx), %ebp
+; FALLBACK23-NEXT:    movl %ebp, %edi
+; FALLBACK23-NEXT:    shrdl %cl, %edx, %edi
+; FALLBACK23-NEXT:    movl (%esp,%ebx), %esi
+; FALLBACK23-NEXT:    movl 4(%esp,%ebx), %eax
+; FALLBACK23-NEXT:    movl %eax, %ebx
+; FALLBACK23-NEXT:    shrdl %cl, %ebp, %ebx
+; FALLBACK23-NEXT:    movl {{[0-9]+}}(%esp), %ebp
+; FALLBACK23-NEXT:    movl %ebx, 4(%ebp)
+; FALLBACK23-NEXT:    movl %edi, 8(%ebp)
+; FALLBACK23-NEXT:    shrxl %ecx, %edx, %edx
+; FALLBACK23-NEXT:    movl %edx, 12(%ebp)
+; FALLBACK23-NEXT:    # kill: def $cl killed $cl killed $ecx
+; FALLBACK23-NEXT:    shrdl %cl, %eax, %esi
+; FALLBACK23-NEXT:    movl %esi, (%ebp)
+; FALLBACK23-NEXT:    addl $44, %esp
+; FALLBACK23-NEXT:    popl %esi
+; FALLBACK23-NEXT:    popl %edi
+; FALLBACK23-NEXT:    popl %ebx
+; FALLBACK23-NEXT:    popl %ebp
+; FALLBACK23-NEXT:    retl
+;
+; FALLBACK24-LABEL: lshr_16bytes:
+; FALLBACK24:       # %bb.0:
+; FALLBACK24-NEXT:    pushl %ebp
+; FALLBACK24-NEXT:    pushl %ebx
+; FALLBACK24-NEXT:    pushl %edi
+; FALLBACK24-NEXT:    pushl %esi
+; FALLBACK24-NEXT:    subl $60, %esp
+; FALLBACK24-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; FALLBACK24-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; FALLBACK24-NEXT:    vmovups (%ecx), %xmm0
+; FALLBACK24-NEXT:    movzbl (%eax), %ecx
+; FALLBACK24-NEXT:    movl %ecx, %eax
+; FALLBACK24-NEXT:    shlb $3, %al
+; FALLBACK24-NEXT:    vxorps %xmm1, %xmm1, %xmm1
+; FALLBACK24-NEXT:    vmovaps %xmm1, {{[0-9]+}}(%esp)
+; FALLBACK24-NEXT:    vmovaps %xmm0, {{[0-9]+}}(%esp)
+; FALLBACK24-NEXT:    andb $12, %cl
+; FALLBACK24-NEXT:    movzbl %cl, %edi
+; FALLBACK24-NEXT:    movl 16(%esp,%edi), %ebx
+; FALLBACK24-NEXT:    movl 20(%esp,%edi), %esi
+; FALLBACK24-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK24-NEXT:    movl %eax, %ecx
+; FALLBACK24-NEXT:    shrl %cl, %ebx
+; FALLBACK24-NEXT:    movl %eax, %edx
+; FALLBACK24-NEXT:    notb %dl
+; FALLBACK24-NEXT:    addl %esi, %esi
+; FALLBACK24-NEXT:    movl %edx, %ecx
+; FALLBACK24-NEXT:    shll %cl, %esi
+; FALLBACK24-NEXT:    orl %ebx, %esi
+; FALLBACK24-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK24-NEXT:    movl 24(%esp,%edi), %ebx
+; FALLBACK24-NEXT:    movl %ebx, %esi
+; FALLBACK24-NEXT:    movl %eax, %ecx
+; FALLBACK24-NEXT:    shrl %cl, %esi
+; FALLBACK24-NEXT:    movl 28(%esp,%edi), %edi
+; FALLBACK24-NEXT:    leal (%edi,%edi), %ebp
+; FALLBACK24-NEXT:    movl %edx, %ecx
+; FALLBACK24-NEXT:    shll %cl, %ebp
+; FALLBACK24-NEXT:    orl %esi, %ebp
+; FALLBACK24-NEXT:    movl %eax, %ecx
+; FALLBACK24-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; FALLBACK24-NEXT:    shrl %cl, %esi
+; FALLBACK24-NEXT:    addl %ebx, %ebx
+; FALLBACK24-NEXT:    movl %edx, %ecx
+; FALLBACK24-NEXT:    shll %cl, %ebx
+; FALLBACK24-NEXT:    orl %esi, %ebx
+; FALLBACK24-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; FALLBACK24-NEXT:    movl %eax, %ecx
+; FALLBACK24-NEXT:    shrl %cl, %edi
+; FALLBACK24-NEXT:    movl %edi, 12(%edx)
+; FALLBACK24-NEXT:    movl %ebx, 4(%edx)
+; FALLBACK24-NEXT:    movl %ebp, 8(%edx)
+; FALLBACK24-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK24-NEXT:    movl %eax, (%edx)
+; FALLBACK24-NEXT:    addl $60, %esp
+; FALLBACK24-NEXT:    popl %esi
+; FALLBACK24-NEXT:    popl %edi
+; FALLBACK24-NEXT:    popl %ebx
+; FALLBACK24-NEXT:    popl %ebp
+; FALLBACK24-NEXT:    retl
+;
+; FALLBACK25-LABEL: lshr_16bytes:
+; FALLBACK25:       # %bb.0:
+; FALLBACK25-NEXT:    pushl %ebp
+; FALLBACK25-NEXT:    pushl %ebx
+; FALLBACK25-NEXT:    pushl %edi
+; FALLBACK25-NEXT:    pushl %esi
+; FALLBACK25-NEXT:    subl $44, %esp
+; FALLBACK25-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; FALLBACK25-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; FALLBACK25-NEXT:    vmovups (%edx), %xmm0
+; FALLBACK25-NEXT:    movzbl (%ecx), %edx
+; FALLBACK25-NEXT:    movl %edx, %ecx
+; FALLBACK25-NEXT:    shlb $3, %cl
+; FALLBACK25-NEXT:    vxorps %xmm1, %xmm1, %xmm1
+; FALLBACK25-NEXT:    vmovaps %xmm1, {{[0-9]+}}(%esp)
+; FALLBACK25-NEXT:    vmovaps %xmm0, (%esp)
+; FALLBACK25-NEXT:    andb $12, %dl
+; FALLBACK25-NEXT:    movzbl %dl, %ebx
+; FALLBACK25-NEXT:    movl 12(%esp,%ebx), %edx
+; FALLBACK25-NEXT:    movl 8(%esp,%ebx), %ebp
+; FALLBACK25-NEXT:    movl %ebp, %edi
+; FALLBACK25-NEXT:    shrdl %cl, %edx, %edi
+; FALLBACK25-NEXT:    movl (%esp,%ebx), %esi
+; FALLBACK25-NEXT:    movl 4(%esp,%ebx), %eax
+; FALLBACK25-NEXT:    movl %eax, %ebx
+; FALLBACK25-NEXT:    shrdl %cl, %ebp, %ebx
+; FALLBACK25-NEXT:    movl {{[0-9]+}}(%esp), %ebp
+; FALLBACK25-NEXT:    movl %ebx, 4(%ebp)
+; FALLBACK25-NEXT:    movl %edi, 8(%ebp)
+; FALLBACK25-NEXT:    shrdl %cl, %eax, %esi
+; FALLBACK25-NEXT:    shrl %cl, %edx
+; FALLBACK25-NEXT:    movl %edx, 12(%ebp)
+; FALLBACK25-NEXT:    movl %esi, (%ebp)
+; FALLBACK25-NEXT:    addl $44, %esp
+; FALLBACK25-NEXT:    popl %esi
+; FALLBACK25-NEXT:    popl %edi
+; FALLBACK25-NEXT:    popl %ebx
+; FALLBACK25-NEXT:    popl %ebp
+; FALLBACK25-NEXT:    retl
+;
+; FALLBACK26-LABEL: lshr_16bytes:
+; FALLBACK26:       # %bb.0:
+; FALLBACK26-NEXT:    pushl %ebp
+; FALLBACK26-NEXT:    pushl %ebx
+; FALLBACK26-NEXT:    pushl %edi
+; FALLBACK26-NEXT:    pushl %esi
+; FALLBACK26-NEXT:    subl $44, %esp
+; FALLBACK26-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; FALLBACK26-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; FALLBACK26-NEXT:    vmovups (%ecx), %xmm0
+; FALLBACK26-NEXT:    movzbl (%eax), %ecx
+; FALLBACK26-NEXT:    movl %ecx, %eax
+; FALLBACK26-NEXT:    shlb $3, %al
+; FALLBACK26-NEXT:    vxorps %xmm1, %xmm1, %xmm1
+; FALLBACK26-NEXT:    vmovaps %xmm1, {{[0-9]+}}(%esp)
+; FALLBACK26-NEXT:    vmovaps %xmm0, (%esp)
+; FALLBACK26-NEXT:    andb $12, %cl
+; FALLBACK26-NEXT:    movzbl %cl, %edi
+; FALLBACK26-NEXT:    shrxl %eax, (%esp,%edi), %ebx
+; FALLBACK26-NEXT:    movl %eax, %ecx
+; FALLBACK26-NEXT:    notb %cl
+; FALLBACK26-NEXT:    movl 4(%esp,%edi), %ebp
+; FALLBACK26-NEXT:    movl 8(%esp,%edi), %esi
+; FALLBACK26-NEXT:    leal (%ebp,%ebp), %edx
+; FALLBACK26-NEXT:    shlxl %ecx, %edx, %edx
+; FALLBACK26-NEXT:    orl %ebx, %edx
+; FALLBACK26-NEXT:    shrxl %eax, %esi, %ebx
+; FALLBACK26-NEXT:    shrxl %eax, %ebp, %ebp
+; FALLBACK26-NEXT:    movl 12(%esp,%edi), %edi
+; FALLBACK26-NEXT:    shrxl %eax, %edi, %eax
+; FALLBACK26-NEXT:    addl %edi, %edi
+; FALLBACK26-NEXT:    shlxl %ecx, %edi, %edi
+; FALLBACK26-NEXT:    orl %ebx, %edi
+; FALLBACK26-NEXT:    addl %esi, %esi
+; FALLBACK26-NEXT:    shlxl %ecx, %esi, %ecx
+; FALLBACK26-NEXT:    orl %ebp, %ecx
+; FALLBACK26-NEXT:    movl {{[0-9]+}}(%esp), %esi
+; FALLBACK26-NEXT:    movl %eax, 12(%esi)
+; FALLBACK26-NEXT:    movl %ecx, 4(%esi)
+; FALLBACK26-NEXT:    movl %edi, 8(%esi)
+; FALLBACK26-NEXT:    movl %edx, (%esi)
+; FALLBACK26-NEXT:    addl $44, %esp
+; FALLBACK26-NEXT:    popl %esi
+; FALLBACK26-NEXT:    popl %edi
+; FALLBACK26-NEXT:    popl %ebx
+; FALLBACK26-NEXT:    popl %ebp
+; FALLBACK26-NEXT:    retl
+;
+; FALLBACK27-LABEL: lshr_16bytes:
+; FALLBACK27:       # %bb.0:
+; FALLBACK27-NEXT:    pushl %ebp
+; FALLBACK27-NEXT:    pushl %ebx
+; FALLBACK27-NEXT:    pushl %edi
+; FALLBACK27-NEXT:    pushl %esi
+; FALLBACK27-NEXT:    subl $44, %esp
+; FALLBACK27-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; FALLBACK27-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; FALLBACK27-NEXT:    vmovups (%edx), %xmm0
+; FALLBACK27-NEXT:    movzbl (%ecx), %edx
+; FALLBACK27-NEXT:    movl %edx, %ecx
+; FALLBACK27-NEXT:    shlb $3, %cl
+; FALLBACK27-NEXT:    vxorps %xmm1, %xmm1, %xmm1
+; FALLBACK27-NEXT:    vmovaps %xmm1, {{[0-9]+}}(%esp)
+; FALLBACK27-NEXT:    vmovaps %xmm0, (%esp)
+; FALLBACK27-NEXT:    andb $12, %dl
+; FALLBACK27-NEXT:    movzbl %dl, %ebx
+; FALLBACK27-NEXT:    movl 12(%esp,%ebx), %edx
+; FALLBACK27-NEXT:    movl 8(%esp,%ebx), %ebp
+; FALLBACK27-NEXT:    movl %ebp, %edi
+; FALLBACK27-NEXT:    shrdl %cl, %edx, %edi
+; FALLBACK27-NEXT:    movl (%esp,%ebx), %esi
+; FALLBACK27-NEXT:    movl 4(%esp,%ebx), %eax
+; FALLBACK27-NEXT:    movl %eax, %ebx
+; FALLBACK27-NEXT:    shrdl %cl, %ebp, %ebx
+; FALLBACK27-NEXT:    movl {{[0-9]+}}(%esp), %ebp
+; FALLBACK27-NEXT:    movl %ebx, 4(%ebp)
+; FALLBACK27-NEXT:    movl %edi, 8(%ebp)
+; FALLBACK27-NEXT:    shrxl %ecx, %edx, %edx
+; FALLBACK27-NEXT:    movl %edx, 12(%ebp)
+; FALLBACK27-NEXT:    # kill: def $cl killed $cl killed $ecx
+; FALLBACK27-NEXT:    shrdl %cl, %eax, %esi
+; FALLBACK27-NEXT:    movl %esi, (%ebp)
+; FALLBACK27-NEXT:    addl $44, %esp
+; FALLBACK27-NEXT:    popl %esi
+; FALLBACK27-NEXT:    popl %edi
+; FALLBACK27-NEXT:    popl %ebx
+; FALLBACK27-NEXT:    popl %ebp
+; FALLBACK27-NEXT:    retl
+;
+; FALLBACK28-LABEL: lshr_16bytes:
+; FALLBACK28:       # %bb.0:
+; FALLBACK28-NEXT:    pushl %ebp
+; FALLBACK28-NEXT:    pushl %ebx
+; FALLBACK28-NEXT:    pushl %edi
+; FALLBACK28-NEXT:    pushl %esi
+; FALLBACK28-NEXT:    subl $60, %esp
+; FALLBACK28-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; FALLBACK28-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; FALLBACK28-NEXT:    vmovups (%ecx), %xmm0
+; FALLBACK28-NEXT:    movzbl (%eax), %ecx
+; FALLBACK28-NEXT:    movl %ecx, %eax
+; FALLBACK28-NEXT:    shlb $3, %al
+; FALLBACK28-NEXT:    vxorps %xmm1, %xmm1, %xmm1
+; FALLBACK28-NEXT:    vmovaps %xmm1, {{[0-9]+}}(%esp)
+; FALLBACK28-NEXT:    vmovaps %xmm0, {{[0-9]+}}(%esp)
+; FALLBACK28-NEXT:    andb $12, %cl
+; FALLBACK28-NEXT:    movzbl %cl, %edi
+; FALLBACK28-NEXT:    movl 16(%esp,%edi), %ebx
+; FALLBACK28-NEXT:    movl 20(%esp,%edi), %esi
+; FALLBACK28-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK28-NEXT:    movl %eax, %ecx
+; FALLBACK28-NEXT:    shrl %cl, %ebx
+; FALLBACK28-NEXT:    movl %eax, %edx
+; FALLBACK28-NEXT:    notb %dl
+; FALLBACK28-NEXT:    addl %esi, %esi
+; FALLBACK28-NEXT:    movl %edx, %ecx
+; FALLBACK28-NEXT:    shll %cl, %esi
+; FALLBACK28-NEXT:    orl %ebx, %esi
+; FALLBACK28-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK28-NEXT:    movl 24(%esp,%edi), %ebx
+; FALLBACK28-NEXT:    movl %ebx, %esi
+; FALLBACK28-NEXT:    movl %eax, %ecx
+; FALLBACK28-NEXT:    shrl %cl, %esi
+; FALLBACK28-NEXT:    movl 28(%esp,%edi), %edi
+; FALLBACK28-NEXT:    leal (%edi,%edi), %ebp
+; FALLBACK28-NEXT:    movl %edx, %ecx
+; FALLBACK28-NEXT:    shll %cl, %ebp
+; FALLBACK28-NEXT:    orl %esi, %ebp
+; FALLBACK28-NEXT:    movl %eax, %ecx
+; FALLBACK28-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; FALLBACK28-NEXT:    shrl %cl, %esi
+; FALLBACK28-NEXT:    addl %ebx, %ebx
+; FALLBACK28-NEXT:    movl %edx, %ecx
+; FALLBACK28-NEXT:    shll %cl, %ebx
+; FALLBACK28-NEXT:    orl %esi, %ebx
+; FALLBACK28-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; FALLBACK28-NEXT:    movl %eax, %ecx
+; FALLBACK28-NEXT:    shrl %cl, %edi
+; FALLBACK28-NEXT:    movl %edi, 12(%edx)
+; FALLBACK28-NEXT:    movl %ebx, 4(%edx)
+; FALLBACK28-NEXT:    movl %ebp, 8(%edx)
+; FALLBACK28-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK28-NEXT:    movl %eax, (%edx)
+; FALLBACK28-NEXT:    addl $60, %esp
+; FALLBACK28-NEXT:    popl %esi
+; FALLBACK28-NEXT:    popl %edi
+; FALLBACK28-NEXT:    popl %ebx
+; FALLBACK28-NEXT:    popl %ebp
+; FALLBACK28-NEXT:    retl
+;
+; FALLBACK29-LABEL: lshr_16bytes:
+; FALLBACK29:       # %bb.0:
+; FALLBACK29-NEXT:    pushl %ebp
+; FALLBACK29-NEXT:    pushl %ebx
+; FALLBACK29-NEXT:    pushl %edi
+; FALLBACK29-NEXT:    pushl %esi
+; FALLBACK29-NEXT:    subl $44, %esp
+; FALLBACK29-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; FALLBACK29-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; FALLBACK29-NEXT:    vmovups (%edx), %xmm0
+; FALLBACK29-NEXT:    movzbl (%ecx), %edx
+; FALLBACK29-NEXT:    movl %edx, %ecx
+; FALLBACK29-NEXT:    shlb $3, %cl
+; FALLBACK29-NEXT:    vxorps %xmm1, %xmm1, %xmm1
+; FALLBACK29-NEXT:    vmovaps %xmm1, {{[0-9]+}}(%esp)
+; FALLBACK29-NEXT:    vmovaps %xmm0, (%esp)
+; FALLBACK29-NEXT:    andb $12, %dl
+; FALLBACK29-NEXT:    movzbl %dl, %ebx
+; FALLBACK29-NEXT:    movl 12(%esp,%ebx), %edx
+; FALLBACK29-NEXT:    movl 8(%esp,%ebx), %ebp
+; FALLBACK29-NEXT:    movl %ebp, %edi
+; FALLBACK29-NEXT:    shrdl %cl, %edx, %edi
+; FALLBACK29-NEXT:    movl (%esp,%ebx), %esi
+; FALLBACK29-NEXT:    movl 4(%esp,%ebx), %eax
+; FALLBACK29-NEXT:    movl %eax, %ebx
+; FALLBACK29-NEXT:    shrdl %cl, %ebp, %ebx
+; FALLBACK29-NEXT:    movl {{[0-9]+}}(%esp), %ebp
+; FALLBACK29-NEXT:    movl %ebx, 4(%ebp)
+; FALLBACK29-NEXT:    movl %edi, 8(%ebp)
+; FALLBACK29-NEXT:    shrdl %cl, %eax, %esi
+; FALLBACK29-NEXT:    shrl %cl, %edx
+; FALLBACK29-NEXT:    movl %edx, 12(%ebp)
+; FALLBACK29-NEXT:    movl %esi, (%ebp)
+; FALLBACK29-NEXT:    addl $44, %esp
+; FALLBACK29-NEXT:    popl %esi
+; FALLBACK29-NEXT:    popl %edi
+; FALLBACK29-NEXT:    popl %ebx
+; FALLBACK29-NEXT:    popl %ebp
+; FALLBACK29-NEXT:    retl
+;
+; FALLBACK30-LABEL: lshr_16bytes:
+; FALLBACK30:       # %bb.0:
+; FALLBACK30-NEXT:    pushl %ebp
+; FALLBACK30-NEXT:    pushl %ebx
+; FALLBACK30-NEXT:    pushl %edi
+; FALLBACK30-NEXT:    pushl %esi
+; FALLBACK30-NEXT:    subl $44, %esp
+; FALLBACK30-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; FALLBACK30-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; FALLBACK30-NEXT:    vmovups (%ecx), %xmm0
+; FALLBACK30-NEXT:    movzbl (%eax), %ecx
+; FALLBACK30-NEXT:    movl %ecx, %eax
+; FALLBACK30-NEXT:    shlb $3, %al
+; FALLBACK30-NEXT:    vxorps %xmm1, %xmm1, %xmm1
+; FALLBACK30-NEXT:    vmovaps %xmm1, {{[0-9]+}}(%esp)
+; FALLBACK30-NEXT:    vmovaps %xmm0, (%esp)
+; FALLBACK30-NEXT:    andb $12, %cl
+; FALLBACK30-NEXT:    movzbl %cl, %edi
+; FALLBACK30-NEXT:    shrxl %eax, (%esp,%edi), %ebx
+; FALLBACK30-NEXT:    movl %eax, %ecx
+; FALLBACK30-NEXT:    notb %cl
+; FALLBACK30-NEXT:    movl 4(%esp,%edi), %ebp
+; FALLBACK30-NEXT:    movl 8(%esp,%edi), %esi
+; FALLBACK30-NEXT:    leal (%ebp,%ebp), %edx
+; FALLBACK30-NEXT:    shlxl %ecx, %edx, %edx
+; FALLBACK30-NEXT:    orl %ebx, %edx
+; FALLBACK30-NEXT:    shrxl %eax, %esi, %ebx
+; FALLBACK30-NEXT:    shrxl %eax, %ebp, %ebp
+; FALLBACK30-NEXT:    movl 12(%esp,%edi), %edi
+; FALLBACK30-NEXT:    shrxl %eax, %edi, %eax
+; FALLBACK30-NEXT:    addl %edi, %edi
+; FALLBACK30-NEXT:    shlxl %ecx, %edi, %edi
+; FALLBACK30-NEXT:    orl %ebx, %edi
+; FALLBACK30-NEXT:    addl %esi, %esi
+; FALLBACK30-NEXT:    shlxl %ecx, %esi, %ecx
+; FALLBACK30-NEXT:    orl %ebp, %ecx
+; FALLBACK30-NEXT:    movl {{[0-9]+}}(%esp), %esi
+; FALLBACK30-NEXT:    movl %eax, 12(%esi)
+; FALLBACK30-NEXT:    movl %ecx, 4(%esi)
+; FALLBACK30-NEXT:    movl %edi, 8(%esi)
+; FALLBACK30-NEXT:    movl %edx, (%esi)
+; FALLBACK30-NEXT:    addl $44, %esp
+; FALLBACK30-NEXT:    popl %esi
+; FALLBACK30-NEXT:    popl %edi
+; FALLBACK30-NEXT:    popl %ebx
+; FALLBACK30-NEXT:    popl %ebp
+; FALLBACK30-NEXT:    retl
+;
+; FALLBACK31-LABEL: lshr_16bytes:
+; FALLBACK31:       # %bb.0:
+; FALLBACK31-NEXT:    pushl %ebp
+; FALLBACK31-NEXT:    pushl %ebx
+; FALLBACK31-NEXT:    pushl %edi
+; FALLBACK31-NEXT:    pushl %esi
+; FALLBACK31-NEXT:    subl $44, %esp
+; FALLBACK31-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; FALLBACK31-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; FALLBACK31-NEXT:    vmovups (%edx), %xmm0
+; FALLBACK31-NEXT:    movzbl (%ecx), %edx
+; FALLBACK31-NEXT:    movl %edx, %ecx
+; FALLBACK31-NEXT:    shlb $3, %cl
+; FALLBACK31-NEXT:    vxorps %xmm1, %xmm1, %xmm1
+; FALLBACK31-NEXT:    vmovaps %xmm1, {{[0-9]+}}(%esp)
+; FALLBACK31-NEXT:    vmovaps %xmm0, (%esp)
+; FALLBACK31-NEXT:    andb $12, %dl
+; FALLBACK31-NEXT:    movzbl %dl, %ebx
+; FALLBACK31-NEXT:    movl 12(%esp,%ebx), %edx
+; FALLBACK31-NEXT:    movl 8(%esp,%ebx), %ebp
+; FALLBACK31-NEXT:    movl %ebp, %edi
+; FALLBACK31-NEXT:    shrdl %cl, %edx, %edi
+; FALLBACK31-NEXT:    movl (%esp,%ebx), %esi
+; FALLBACK31-NEXT:    movl 4(%esp,%ebx), %eax
+; FALLBACK31-NEXT:    movl %eax, %ebx
+; FALLBACK31-NEXT:    shrdl %cl, %ebp, %ebx
+; FALLBACK31-NEXT:    movl {{[0-9]+}}(%esp), %ebp
+; FALLBACK31-NEXT:    movl %ebx, 4(%ebp)
+; FALLBACK31-NEXT:    movl %edi, 8(%ebp)
+; FALLBACK31-NEXT:    shrxl %ecx, %edx, %edx
+; FALLBACK31-NEXT:    movl %edx, 12(%ebp)
+; FALLBACK31-NEXT:    # kill: def $cl killed $cl killed $ecx
+; FALLBACK31-NEXT:    shrdl %cl, %eax, %esi
+; FALLBACK31-NEXT:    movl %esi, (%ebp)
+; FALLBACK31-NEXT:    addl $44, %esp
+; FALLBACK31-NEXT:    popl %esi
+; FALLBACK31-NEXT:    popl %edi
+; FALLBACK31-NEXT:    popl %ebx
+; FALLBACK31-NEXT:    popl %ebp
+; FALLBACK31-NEXT:    retl
+  %src = load i128, ptr %src.ptr, align 1
+  %byteOff = load i128, ptr %byteOff.ptr, align 1
+  %bitOff = shl i128 %byteOff, 3
+  %res = lshr i128 %src, %bitOff
+  store i128 %res, ptr %dst, align 1
+  ret void
+}
+
+define void @lshr_16bytes_dwordOff(ptr %src.ptr, ptr %dwordOff.ptr, ptr %dst) nounwind {
+; X64-NO-SHLD-NO-BMI2-LABEL: lshr_16bytes_dwordOff:
+; X64-NO-SHLD-NO-BMI2:       # %bb.0:
+; X64-NO-SHLD-NO-BMI2-NEXT:    movq (%rdi), %r8
+; X64-NO-SHLD-NO-BMI2-NEXT:    movq 8(%rdi), %rdi
+; X64-NO-SHLD-NO-BMI2-NEXT:    movzbl (%rsi), %eax
+; X64-NO-SHLD-NO-BMI2-NEXT:    shlb $5, %al
+; X64-NO-SHLD-NO-BMI2-NEXT:    movl %eax, %ecx
+; X64-NO-SHLD-NO-BMI2-NEXT:    shrq %cl, %r8
+; X64-NO-SHLD-NO-BMI2-NEXT:    leaq (%rdi,%rdi), %rsi
+; X64-NO-SHLD-NO-BMI2-NEXT:    notb %cl
+; X64-NO-SHLD-NO-BMI2-NEXT:    shlq %cl, %rsi
+; X64-NO-SHLD-NO-BMI2-NEXT:    orq %r8, %rsi
+; X64-NO-SHLD-NO-BMI2-NEXT:    movl %eax, %ecx
+; X64-NO-SHLD-NO-BMI2-NEXT:    shrq %cl, %rdi
+; X64-NO-SHLD-NO-BMI2-NEXT:    xorl %ecx, %ecx
+; X64-NO-SHLD-NO-BMI2-NEXT:    testb $64, %al
+; X64-NO-SHLD-NO-BMI2-NEXT:    cmovneq %rdi, %rsi
+; X64-NO-SHLD-NO-BMI2-NEXT:    cmoveq %rdi, %rcx
+; X64-NO-SHLD-NO-BMI2-NEXT:    movq %rcx, 8(%rdx)
+; X64-NO-SHLD-NO-BMI2-NEXT:    movq %rsi, (%rdx)
+; X64-NO-SHLD-NO-BMI2-NEXT:    retq
+;
+; X64-HAVE-SHLD-NO-BMI2-LABEL: lshr_16bytes_dwordOff:
+; X64-HAVE-SHLD-NO-BMI2:       # %bb.0:
+; X64-HAVE-SHLD-NO-BMI2-NEXT:    movq (%rdi), %rax
+; X64-HAVE-SHLD-NO-BMI2-NEXT:    movq 8(%rdi), %rdi
+; X64-HAVE-SHLD-NO-BMI2-NEXT:    movzbl (%rsi), %ecx
+; X64-HAVE-SHLD-NO-BMI2-NEXT:    shlb $5, %cl
+; X64-HAVE-SHLD-NO-BMI2-NEXT:    movq %rdi, %rsi
+; X64-HAVE-SHLD-NO-BMI2-NEXT:    shrq %cl, %rsi
+; X64-HAVE-SHLD-NO-BMI2-NEXT:    shrdq %cl, %rdi, %rax
+; X64-HAVE-SHLD-NO-BMI2-NEXT:    xorl %edi, %edi
+; X64-HAVE-SHLD-NO-BMI2-NEXT:    testb $64, %cl
+; X64-HAVE-SHLD-NO-BMI2-NEXT:    cmovneq %rsi, %rax
+; X64-HAVE-SHLD-NO-BMI2-NEXT:    cmoveq %rsi, %rdi
+; X64-HAVE-SHLD-NO-BMI2-NEXT:    movq %rdi, 8(%rdx)
+; X64-HAVE-SHLD-NO-BMI2-NEXT:    movq %rax, (%rdx)
+; X64-HAVE-SHLD-NO-BMI2-NEXT:    retq
+;
+; X64-NO-SHLD-HAVE-BMI2-LABEL: lshr_16bytes_dwordOff:
+; X64-NO-SHLD-HAVE-BMI2:       # %bb.0:
+; X64-NO-SHLD-HAVE-BMI2-NEXT:    movq 8(%rdi), %rax
+; X64-NO-SHLD-HAVE-BMI2-NEXT:    movzbl (%rsi), %ecx
+; X64-NO-SHLD-HAVE-BMI2-NEXT:    shlb $5, %cl
+; X64-NO-SHLD-HAVE-BMI2-NEXT:    shrxq %rcx, (%rdi), %rsi
+; X64-NO-SHLD-HAVE-BMI2-NEXT:    movl %ecx, %edi
+; X64-NO-SHLD-HAVE-BMI2-NEXT:    notb %dil
+; X64-NO-SHLD-HAVE-BMI2-NEXT:    leaq (%rax,%rax), %r8
+; X64-NO-SHLD-HAVE-BMI2-NEXT:    shlxq %rdi, %r8, %rdi
+; X64-NO-SHLD-HAVE-BMI2-NEXT:    orq %rsi, %rdi
+; X64-NO-SHLD-HAVE-BMI2-NEXT:    shrxq %rcx, %rax, %rax
+; X64-NO-SHLD-HAVE-BMI2-NEXT:    xorl %esi, %esi
+; X64-NO-SHLD-HAVE-BMI2-NEXT:    testb $64, %cl
+; X64-NO-SHLD-HAVE-BMI2-NEXT:    cmovneq %rax, %rdi
+; X64-NO-SHLD-HAVE-BMI2-NEXT:    cmoveq %rax, %rsi
+; X64-NO-SHLD-HAVE-BMI2-NEXT:    movq %rsi, 8(%rdx)
+; X64-NO-SHLD-HAVE-BMI2-NEXT:    movq %rdi, (%rdx)
+; X64-NO-SHLD-HAVE-BMI2-NEXT:    retq
+;
+; X64-HAVE-SHLD-HAVE-BMI2-LABEL: lshr_16bytes_dwordOff:
+; X64-HAVE-SHLD-HAVE-BMI2:       # %bb.0:
+; X64-HAVE-SHLD-HAVE-BMI2-NEXT:    movq (%rdi), %rax
+; X64-HAVE-SHLD-HAVE-BMI2-NEXT:    movq 8(%rdi), %rdi
+; X64-HAVE-SHLD-HAVE-BMI2-NEXT:    movzbl (%rsi), %ecx
+; X64-HAVE-SHLD-HAVE-BMI2-NEXT:    shlb $5, %cl
+; X64-HAVE-SHLD-HAVE-BMI2-NEXT:    shrdq %cl, %rdi, %rax
+; X64-HAVE-SHLD-HAVE-BMI2-NEXT:    shrxq %rcx, %rdi, %rsi
+; X64-HAVE-SHLD-HAVE-BMI2-NEXT:    xorl %edi, %edi
+; X64-HAVE-SHLD-HAVE-BMI2-NEXT:    testb $64, %cl
+; X64-HAVE-SHLD-HAVE-BMI2-NEXT:    cmovneq %rsi, %rax
+; X64-HAVE-SHLD-HAVE-BMI2-NEXT:    cmoveq %rsi, %rdi
+; X64-HAVE-SHLD-HAVE-BMI2-NEXT:    movq %rdi, 8(%rdx)
+; X64-HAVE-SHLD-HAVE-BMI2-NEXT:    movq %rax, (%rdx)
+; X64-HAVE-SHLD-HAVE-BMI2-NEXT:    retq
+;
+; X86-SSE2-LABEL: lshr_16bytes_dwordOff:
 ; X86-SSE2:       # %bb.0:
 ; X86-SSE2-NEXT:    pushl %ebx
 ; X86-SSE2-NEXT:    pushl %edi
@@ -660,19 +1522,17 @@ define void @lshr_16bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ; X86-SSE2-NEXT:    movl 8(%edx), %ebx
 ; X86-SSE2-NEXT:    movl 12(%edx), %edx
 ; X86-SSE2-NEXT:    movzbl (%ecx), %ecx
+; X86-SSE2-NEXT:    xorps %xmm0, %xmm0
+; X86-SSE2-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
 ; X86-SSE2-NEXT:    movl %edx, {{[0-9]+}}(%esp)
 ; X86-SSE2-NEXT:    movl %ebx, {{[0-9]+}}(%esp)
 ; X86-SSE2-NEXT:    movl %edi, {{[0-9]+}}(%esp)
 ; X86-SSE2-NEXT:    movl %esi, (%esp)
-; X86-SSE2-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-SSE2-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-SSE2-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-SSE2-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-SSE2-NEXT:    andl $15, %ecx
-; X86-SSE2-NEXT:    movl (%esp,%ecx), %edx
-; X86-SSE2-NEXT:    movl 4(%esp,%ecx), %esi
-; X86-SSE2-NEXT:    movl 12(%esp,%ecx), %edi
-; X86-SSE2-NEXT:    movl 8(%esp,%ecx), %ecx
+; X86-SSE2-NEXT:    andl $3, %ecx
+; X86-SSE2-NEXT:    movl (%esp,%ecx,4), %edx
+; X86-SSE2-NEXT:    movl 4(%esp,%ecx,4), %esi
+; X86-SSE2-NEXT:    movl 12(%esp,%ecx,4), %edi
+; X86-SSE2-NEXT:    movl 8(%esp,%ecx,4), %ecx
 ; X86-SSE2-NEXT:    movl %ecx, 8(%eax)
 ; X86-SSE2-NEXT:    movl %edi, 12(%eax)
 ; X86-SSE2-NEXT:    movl %edx, (%eax)
@@ -683,46 +1543,47 @@ define void @lshr_16bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ; X86-SSE2-NEXT:    popl %ebx
 ; X86-SSE2-NEXT:    retl
 ;
-; X86-SSE42-LABEL: lshr_16bytes:
+; X86-SSE42-LABEL: lshr_16bytes_dwordOff:
 ; X86-SSE42:       # %bb.0:
-; X86-SSE42-NEXT:    subl $32, %esp
+; X86-SSE42-NEXT:    subl $44, %esp
 ; X86-SSE42-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-SSE42-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-SSE42-NEXT:    movl {{[0-9]+}}(%esp), %edx
 ; X86-SSE42-NEXT:    movups (%edx), %xmm0
 ; X86-SSE42-NEXT:    movzbl (%ecx), %ecx
 ; X86-SSE42-NEXT:    xorps %xmm1, %xmm1
-; X86-SSE42-NEXT:    movups %xmm1, {{[0-9]+}}(%esp)
-; X86-SSE42-NEXT:    movups %xmm0, (%esp)
-; X86-SSE42-NEXT:    andl $15, %ecx
-; X86-SSE42-NEXT:    movups (%esp,%ecx), %xmm0
+; X86-SSE42-NEXT:    movaps %xmm1, {{[0-9]+}}(%esp)
+; X86-SSE42-NEXT:    movaps %xmm0, (%esp)
+; X86-SSE42-NEXT:    andl $3, %ecx
+; X86-SSE42-NEXT:    movups (%esp,%ecx,4), %xmm0
 ; X86-SSE42-NEXT:    movups %xmm0, (%eax)
-; X86-SSE42-NEXT:    addl $32, %esp
+; X86-SSE42-NEXT:    addl $44, %esp
 ; X86-SSE42-NEXT:    retl
 ;
-; X86-AVX-LABEL: lshr_16bytes:
+; X86-AVX-LABEL: lshr_16bytes_dwordOff:
 ; X86-AVX:       # %bb.0:
-; X86-AVX-NEXT:    subl $32, %esp
+; X86-AVX-NEXT:    subl $44, %esp
 ; X86-AVX-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-AVX-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-AVX-NEXT:    movl {{[0-9]+}}(%esp), %edx
 ; X86-AVX-NEXT:    vmovups (%edx), %xmm0
 ; X86-AVX-NEXT:    movzbl (%ecx), %ecx
 ; X86-AVX-NEXT:    vxorps %xmm1, %xmm1, %xmm1
-; X86-AVX-NEXT:    vmovups %xmm1, {{[0-9]+}}(%esp)
-; X86-AVX-NEXT:    vmovups %xmm0, (%esp)
-; X86-AVX-NEXT:    andl $15, %ecx
-; X86-AVX-NEXT:    vmovups (%esp,%ecx), %xmm0
+; X86-AVX-NEXT:    vmovaps %xmm1, {{[0-9]+}}(%esp)
+; X86-AVX-NEXT:    vmovaps %xmm0, (%esp)
+; X86-AVX-NEXT:    andl $3, %ecx
+; X86-AVX-NEXT:    vmovups (%esp,%ecx,4), %xmm0
 ; X86-AVX-NEXT:    vmovups %xmm0, (%eax)
-; X86-AVX-NEXT:    addl $32, %esp
+; X86-AVX-NEXT:    addl $44, %esp
 ; X86-AVX-NEXT:    retl
   %src = load i128, ptr %src.ptr, align 1
-  %byteOff = load i128, ptr %byteOff.ptr, align 1
-  %bitOff = shl i128 %byteOff, 3
+  %dwordOff = load i128, ptr %dwordOff.ptr, align 1
+  %bitOff = shl i128 %dwordOff, 5
   %res = lshr i128 %src, %bitOff
   store i128 %res, ptr %dst, align 1
   ret void
 }
+
 define void @shl_16bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ; X64-NO-SHLD-NO-BMI2-LABEL: shl_16bytes:
 ; X64-NO-SHLD-NO-BMI2:       # %bb.0:
@@ -800,7 +1661,877 @@ define void @shl_16bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ; X64-HAVE-SHLD-HAVE-BMI2-NEXT:    movq %rsi, (%rdx)
 ; X64-HAVE-SHLD-HAVE-BMI2-NEXT:    retq
 ;
-; X86-SSE2-LABEL: shl_16bytes:
+; FALLBACK16-LABEL: shl_16bytes:
+; FALLBACK16:       # %bb.0:
+; FALLBACK16-NEXT:    pushl %ebp
+; FALLBACK16-NEXT:    pushl %ebx
+; FALLBACK16-NEXT:    pushl %edi
+; FALLBACK16-NEXT:    pushl %esi
+; FALLBACK16-NEXT:    subl $60, %esp
+; FALLBACK16-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; FALLBACK16-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; FALLBACK16-NEXT:    movl (%ecx), %ebx
+; FALLBACK16-NEXT:    movl 4(%ecx), %esi
+; FALLBACK16-NEXT:    movl 8(%ecx), %edi
+; FALLBACK16-NEXT:    movl 12(%ecx), %ecx
+; FALLBACK16-NEXT:    movb (%eax), %ah
+; FALLBACK16-NEXT:    movb %ah, %dh
+; FALLBACK16-NEXT:    shlb $3, %dh
+; FALLBACK16-NEXT:    xorps %xmm0, %xmm0
+; FALLBACK16-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
+; FALLBACK16-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; FALLBACK16-NEXT:    movl %edi, {{[0-9]+}}(%esp)
+; FALLBACK16-NEXT:    movl %esi, {{[0-9]+}}(%esp)
+; FALLBACK16-NEXT:    movl %ebx, {{[0-9]+}}(%esp)
+; FALLBACK16-NEXT:    andb $12, %ah
+; FALLBACK16-NEXT:    negb %ah
+; FALLBACK16-NEXT:    movsbl %ah, %ebp
+; FALLBACK16-NEXT:    movl 32(%esp,%ebp), %ebx
+; FALLBACK16-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK16-NEXT:    movl 36(%esp,%ebp), %esi
+; FALLBACK16-NEXT:    movl %esi, %edi
+; FALLBACK16-NEXT:    movb %dh, %cl
+; FALLBACK16-NEXT:    shll %cl, %edi
+; FALLBACK16-NEXT:    movb %dh, %dl
+; FALLBACK16-NEXT:    notb %dl
+; FALLBACK16-NEXT:    shrl %ebx
+; FALLBACK16-NEXT:    movl %edx, %ecx
+; FALLBACK16-NEXT:    shrl %cl, %ebx
+; FALLBACK16-NEXT:    orl %edi, %ebx
+; FALLBACK16-NEXT:    movl 44(%esp,%ebp), %eax
+; FALLBACK16-NEXT:    movb %dh, %cl
+; FALLBACK16-NEXT:    shll %cl, %eax
+; FALLBACK16-NEXT:    movl 40(%esp,%ebp), %edi
+; FALLBACK16-NEXT:    movl %edi, %ebp
+; FALLBACK16-NEXT:    shrl %ebp
+; FALLBACK16-NEXT:    movl %edx, %ecx
+; FALLBACK16-NEXT:    shrl %cl, %ebp
+; FALLBACK16-NEXT:    orl %eax, %ebp
+; FALLBACK16-NEXT:    movb %dh, %cl
+; FALLBACK16-NEXT:    shll %cl, %edi
+; FALLBACK16-NEXT:    shrl %esi
+; FALLBACK16-NEXT:    movl %edx, %ecx
+; FALLBACK16-NEXT:    shrl %cl, %esi
+; FALLBACK16-NEXT:    orl %edi, %esi
+; FALLBACK16-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; FALLBACK16-NEXT:    movb %dh, %cl
+; FALLBACK16-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; FALLBACK16-NEXT:    shll %cl, %edx
+; FALLBACK16-NEXT:    movl %edx, (%eax)
+; FALLBACK16-NEXT:    movl %esi, 8(%eax)
+; FALLBACK16-NEXT:    movl %ebp, 12(%eax)
+; FALLBACK16-NEXT:    movl %ebx, 4(%eax)
+; FALLBACK16-NEXT:    addl $60, %esp
+; FALLBACK16-NEXT:    popl %esi
+; FALLBACK16-NEXT:    popl %edi
+; FALLBACK16-NEXT:    popl %ebx
+; FALLBACK16-NEXT:    popl %ebp
+; FALLBACK16-NEXT:    retl
+;
+; FALLBACK17-LABEL: shl_16bytes:
+; FALLBACK17:       # %bb.0:
+; FALLBACK17-NEXT:    pushl %ebx
+; FALLBACK17-NEXT:    pushl %edi
+; FALLBACK17-NEXT:    pushl %esi
+; FALLBACK17-NEXT:    subl $32, %esp
+; FALLBACK17-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; FALLBACK17-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; FALLBACK17-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; FALLBACK17-NEXT:    movl (%edx), %esi
+; FALLBACK17-NEXT:    movl 4(%edx), %edi
+; FALLBACK17-NEXT:    movl 8(%edx), %ebx
+; FALLBACK17-NEXT:    movl 12(%edx), %edx
+; FALLBACK17-NEXT:    movb (%ecx), %ch
+; FALLBACK17-NEXT:    movb %ch, %cl
+; FALLBACK17-NEXT:    shlb $3, %cl
+; FALLBACK17-NEXT:    xorps %xmm0, %xmm0
+; FALLBACK17-NEXT:    movaps %xmm0, (%esp)
+; FALLBACK17-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; FALLBACK17-NEXT:    movl %ebx, {{[0-9]+}}(%esp)
+; FALLBACK17-NEXT:    movl %edi, {{[0-9]+}}(%esp)
+; FALLBACK17-NEXT:    movl %esi, {{[0-9]+}}(%esp)
+; FALLBACK17-NEXT:    andb $12, %ch
+; FALLBACK17-NEXT:    negb %ch
+; FALLBACK17-NEXT:    movsbl %ch, %edi
+; FALLBACK17-NEXT:    movl 24(%esp,%edi), %esi
+; FALLBACK17-NEXT:    movl 28(%esp,%edi), %edx
+; FALLBACK17-NEXT:    shldl %cl, %esi, %edx
+; FALLBACK17-NEXT:    movl 16(%esp,%edi), %ebx
+; FALLBACK17-NEXT:    movl 20(%esp,%edi), %edi
+; FALLBACK17-NEXT:    shldl %cl, %edi, %esi
+; FALLBACK17-NEXT:    shldl %cl, %ebx, %edi
+; FALLBACK17-NEXT:    shll %cl, %ebx
+; FALLBACK17-NEXT:    movl %esi, 8(%eax)
+; FALLBACK17-NEXT:    movl %edx, 12(%eax)
+; FALLBACK17-NEXT:    movl %ebx, (%eax)
+; FALLBACK17-NEXT:    movl %edi, 4(%eax)
+; FALLBACK17-NEXT:    addl $32, %esp
+; FALLBACK17-NEXT:    popl %esi
+; FALLBACK17-NEXT:    popl %edi
+; FALLBACK17-NEXT:    popl %ebx
+; FALLBACK17-NEXT:    retl
+;
+; FALLBACK18-LABEL: shl_16bytes:
+; FALLBACK18:       # %bb.0:
+; FALLBACK18-NEXT:    pushl %ebp
+; FALLBACK18-NEXT:    pushl %ebx
+; FALLBACK18-NEXT:    pushl %edi
+; FALLBACK18-NEXT:    pushl %esi
+; FALLBACK18-NEXT:    subl $44, %esp
+; FALLBACK18-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; FALLBACK18-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; FALLBACK18-NEXT:    movl (%ecx), %edx
+; FALLBACK18-NEXT:    movl 4(%ecx), %esi
+; FALLBACK18-NEXT:    movl 8(%ecx), %edi
+; FALLBACK18-NEXT:    movl 12(%ecx), %ecx
+; FALLBACK18-NEXT:    movzbl (%eax), %eax
+; FALLBACK18-NEXT:    movl %eax, %ebx
+; FALLBACK18-NEXT:    shlb $3, %bl
+; FALLBACK18-NEXT:    xorps %xmm0, %xmm0
+; FALLBACK18-NEXT:    movaps %xmm0, (%esp)
+; FALLBACK18-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; FALLBACK18-NEXT:    movl %edi, {{[0-9]+}}(%esp)
+; FALLBACK18-NEXT:    movl %esi, {{[0-9]+}}(%esp)
+; FALLBACK18-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; FALLBACK18-NEXT:    andb $12, %al
+; FALLBACK18-NEXT:    negb %al
+; FALLBACK18-NEXT:    movsbl %al, %edx
+; FALLBACK18-NEXT:    movl 16(%esp,%edx), %edi
+; FALLBACK18-NEXT:    movl 20(%esp,%edx), %ecx
+; FALLBACK18-NEXT:    shlxl %ebx, %ecx, %esi
+; FALLBACK18-NEXT:    shlxl %ebx, %edi, %ebp
+; FALLBACK18-NEXT:    movl %ebx, %eax
+; FALLBACK18-NEXT:    notb %al
+; FALLBACK18-NEXT:    shrl %edi
+; FALLBACK18-NEXT:    shrxl %eax, %edi, %edi
+; FALLBACK18-NEXT:    orl %esi, %edi
+; FALLBACK18-NEXT:    shlxl %ebx, 28(%esp,%edx), %esi
+; FALLBACK18-NEXT:    movl 24(%esp,%edx), %edx
+; FALLBACK18-NEXT:    shlxl %ebx, %edx, %ebx
+; FALLBACK18-NEXT:    shrl %edx
+; FALLBACK18-NEXT:    shrxl %eax, %edx, %edx
+; FALLBACK18-NEXT:    orl %esi, %edx
+; FALLBACK18-NEXT:    shrl %ecx
+; FALLBACK18-NEXT:    shrxl %eax, %ecx, %eax
+; FALLBACK18-NEXT:    orl %ebx, %eax
+; FALLBACK18-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; FALLBACK18-NEXT:    movl %ebp, (%ecx)
+; FALLBACK18-NEXT:    movl %eax, 8(%ecx)
+; FALLBACK18-NEXT:    movl %edx, 12(%ecx)
+; FALLBACK18-NEXT:    movl %edi, 4(%ecx)
+; FALLBACK18-NEXT:    addl $44, %esp
+; FALLBACK18-NEXT:    popl %esi
+; FALLBACK18-NEXT:    popl %edi
+; FALLBACK18-NEXT:    popl %ebx
+; FALLBACK18-NEXT:    popl %ebp
+; FALLBACK18-NEXT:    retl
+;
+; FALLBACK19-LABEL: shl_16bytes:
+; FALLBACK19:       # %bb.0:
+; FALLBACK19-NEXT:    pushl %ebp
+; FALLBACK19-NEXT:    pushl %ebx
+; FALLBACK19-NEXT:    pushl %edi
+; FALLBACK19-NEXT:    pushl %esi
+; FALLBACK19-NEXT:    subl $44, %esp
+; FALLBACK19-NEXT:    movl {{[0-9]+}}(%esp), %ebp
+; FALLBACK19-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; FALLBACK19-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; FALLBACK19-NEXT:    movl (%edx), %esi
+; FALLBACK19-NEXT:    movl 4(%edx), %edi
+; FALLBACK19-NEXT:    movl 8(%edx), %ebx
+; FALLBACK19-NEXT:    movl 12(%edx), %edx
+; FALLBACK19-NEXT:    movzbl (%ecx), %eax
+; FALLBACK19-NEXT:    movl %eax, %ecx
+; FALLBACK19-NEXT:    shlb $3, %cl
+; FALLBACK19-NEXT:    xorps %xmm0, %xmm0
+; FALLBACK19-NEXT:    movaps %xmm0, (%esp)
+; FALLBACK19-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; FALLBACK19-NEXT:    movl %ebx, {{[0-9]+}}(%esp)
+; FALLBACK19-NEXT:    movl %edi, {{[0-9]+}}(%esp)
+; FALLBACK19-NEXT:    movl %esi, {{[0-9]+}}(%esp)
+; FALLBACK19-NEXT:    andb $12, %al
+; FALLBACK19-NEXT:    negb %al
+; FALLBACK19-NEXT:    movsbl %al, %eax
+; FALLBACK19-NEXT:    movl 24(%esp,%eax), %esi
+; FALLBACK19-NEXT:    movl 28(%esp,%eax), %edx
+; FALLBACK19-NEXT:    shldl %cl, %esi, %edx
+; FALLBACK19-NEXT:    movl 16(%esp,%eax), %edi
+; FALLBACK19-NEXT:    movl 20(%esp,%eax), %eax
+; FALLBACK19-NEXT:    shldl %cl, %eax, %esi
+; FALLBACK19-NEXT:    shldl %cl, %edi, %eax
+; FALLBACK19-NEXT:    shlxl %ecx, %edi, %ecx
+; FALLBACK19-NEXT:    movl %esi, 8(%ebp)
+; FALLBACK19-NEXT:    movl %edx, 12(%ebp)
+; FALLBACK19-NEXT:    movl %ecx, (%ebp)
+; FALLBACK19-NEXT:    movl %eax, 4(%ebp)
+; FALLBACK19-NEXT:    addl $44, %esp
+; FALLBACK19-NEXT:    popl %esi
+; FALLBACK19-NEXT:    popl %edi
+; FALLBACK19-NEXT:    popl %ebx
+; FALLBACK19-NEXT:    popl %ebp
+; FALLBACK19-NEXT:    retl
+;
+; FALLBACK20-LABEL: shl_16bytes:
+; FALLBACK20:       # %bb.0:
+; FALLBACK20-NEXT:    pushl %ebp
+; FALLBACK20-NEXT:    pushl %ebx
+; FALLBACK20-NEXT:    pushl %edi
+; FALLBACK20-NEXT:    pushl %esi
+; FALLBACK20-NEXT:    subl $60, %esp
+; FALLBACK20-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; FALLBACK20-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; FALLBACK20-NEXT:    movups (%ecx), %xmm0
+; FALLBACK20-NEXT:    movzbl (%eax), %ecx
+; FALLBACK20-NEXT:    movl %ecx, %eax
+; FALLBACK20-NEXT:    shlb $3, %al
+; FALLBACK20-NEXT:    xorps %xmm1, %xmm1
+; FALLBACK20-NEXT:    movaps %xmm1, {{[0-9]+}}(%esp)
+; FALLBACK20-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
+; FALLBACK20-NEXT:    andb $12, %cl
+; FALLBACK20-NEXT:    negb %cl
+; FALLBACK20-NEXT:    movsbl %cl, %edi
+; FALLBACK20-NEXT:    movl 44(%esp,%edi), %ebx
+; FALLBACK20-NEXT:    movl %eax, %ecx
+; FALLBACK20-NEXT:    shll %cl, %ebx
+; FALLBACK20-NEXT:    movl %eax, %edx
+; FALLBACK20-NEXT:    notb %dl
+; FALLBACK20-NEXT:    movl 40(%esp,%edi), %ebp
+; FALLBACK20-NEXT:    movl %ebp, %esi
+; FALLBACK20-NEXT:    shrl %esi
+; FALLBACK20-NEXT:    movl %edx, %ecx
+; FALLBACK20-NEXT:    shrl %cl, %esi
+; FALLBACK20-NEXT:    orl %ebx, %esi
+; FALLBACK20-NEXT:    movl %eax, %ecx
+; FALLBACK20-NEXT:    shll %cl, %ebp
+; FALLBACK20-NEXT:    movl 32(%esp,%edi), %ecx
+; FALLBACK20-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK20-NEXT:    movl 36(%esp,%edi), %ebx
+; FALLBACK20-NEXT:    movl %ebx, %edi
+; FALLBACK20-NEXT:    shrl %edi
+; FALLBACK20-NEXT:    movl %edx, %ecx
+; FALLBACK20-NEXT:    shrl %cl, %edi
+; FALLBACK20-NEXT:    orl %ebp, %edi
+; FALLBACK20-NEXT:    movl %eax, %ecx
+; FALLBACK20-NEXT:    shll %cl, %ebx
+; FALLBACK20-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
+; FALLBACK20-NEXT:    shrl %ebp
+; FALLBACK20-NEXT:    movl %edx, %ecx
+; FALLBACK20-NEXT:    shrl %cl, %ebp
+; FALLBACK20-NEXT:    orl %ebx, %ebp
+; FALLBACK20-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; FALLBACK20-NEXT:    movl %eax, %ecx
+; FALLBACK20-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK20-NEXT:    shll %cl, %eax
+; FALLBACK20-NEXT:    movl %eax, (%edx)
+; FALLBACK20-NEXT:    movl %ebp, 4(%edx)
+; FALLBACK20-NEXT:    movl %edi, 8(%edx)
+; FALLBACK20-NEXT:    movl %esi, 12(%edx)
+; FALLBACK20-NEXT:    addl $60, %esp
+; FALLBACK20-NEXT:    popl %esi
+; FALLBACK20-NEXT:    popl %edi
+; FALLBACK20-NEXT:    popl %ebx
+; FALLBACK20-NEXT:    popl %ebp
+; FALLBACK20-NEXT:    retl
+;
+; FALLBACK21-LABEL: shl_16bytes:
+; FALLBACK21:       # %bb.0:
+; FALLBACK21-NEXT:    pushl %ebp
+; FALLBACK21-NEXT:    pushl %ebx
+; FALLBACK21-NEXT:    pushl %edi
+; FALLBACK21-NEXT:    pushl %esi
+; FALLBACK21-NEXT:    subl $44, %esp
+; FALLBACK21-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; FALLBACK21-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; FALLBACK21-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; FALLBACK21-NEXT:    movups (%edx), %xmm0
+; FALLBACK21-NEXT:    movzbl (%ecx), %edx
+; FALLBACK21-NEXT:    movl %edx, %ecx
+; FALLBACK21-NEXT:    shlb $3, %cl
+; FALLBACK21-NEXT:    xorps %xmm1, %xmm1
+; FALLBACK21-NEXT:    movaps %xmm1, (%esp)
+; FALLBACK21-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
+; FALLBACK21-NEXT:    andb $12, %dl
+; FALLBACK21-NEXT:    negb %dl
+; FALLBACK21-NEXT:    movsbl %dl, %edi
+; FALLBACK21-NEXT:    movl 24(%esp,%edi), %esi
+; FALLBACK21-NEXT:    movl 28(%esp,%edi), %edx
+; FALLBACK21-NEXT:    shldl %cl, %esi, %edx
+; FALLBACK21-NEXT:    movl 16(%esp,%edi), %ebx
+; FALLBACK21-NEXT:    movl 20(%esp,%edi), %edi
+; FALLBACK21-NEXT:    shldl %cl, %edi, %esi
+; FALLBACK21-NEXT:    movl %ebx, %ebp
+; FALLBACK21-NEXT:    shll %cl, %ebp
+; FALLBACK21-NEXT:    shldl %cl, %ebx, %edi
+; FALLBACK21-NEXT:    movl %edi, 4(%eax)
+; FALLBACK21-NEXT:    movl %esi, 8(%eax)
+; FALLBACK21-NEXT:    movl %edx, 12(%eax)
+; FALLBACK21-NEXT:    movl %ebp, (%eax)
+; FALLBACK21-NEXT:    addl $44, %esp
+; FALLBACK21-NEXT:    popl %esi
+; FALLBACK21-NEXT:    popl %edi
+; FALLBACK21-NEXT:    popl %ebx
+; FALLBACK21-NEXT:    popl %ebp
+; FALLBACK21-NEXT:    retl
+;
+; FALLBACK22-LABEL: shl_16bytes:
+; FALLBACK22:       # %bb.0:
+; FALLBACK22-NEXT:    pushl %ebp
+; FALLBACK22-NEXT:    pushl %ebx
+; FALLBACK22-NEXT:    pushl %edi
+; FALLBACK22-NEXT:    pushl %esi
+; FALLBACK22-NEXT:    subl $44, %esp
+; FALLBACK22-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; FALLBACK22-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; FALLBACK22-NEXT:    movups (%ecx), %xmm0
+; FALLBACK22-NEXT:    movzbl (%eax), %ecx
+; FALLBACK22-NEXT:    movl %ecx, %eax
+; FALLBACK22-NEXT:    shlb $3, %al
+; FALLBACK22-NEXT:    xorps %xmm1, %xmm1
+; FALLBACK22-NEXT:    movaps %xmm1, (%esp)
+; FALLBACK22-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
+; FALLBACK22-NEXT:    andb $12, %cl
+; FALLBACK22-NEXT:    negb %cl
+; FALLBACK22-NEXT:    movsbl %cl, %ecx
+; FALLBACK22-NEXT:    shlxl %eax, 28(%esp,%ecx), %esi
+; FALLBACK22-NEXT:    movl 24(%esp,%ecx), %edx
+; FALLBACK22-NEXT:    shlxl %eax, %edx, %edi
+; FALLBACK22-NEXT:    movl %eax, %ebx
+; FALLBACK22-NEXT:    notb %bl
+; FALLBACK22-NEXT:    shrl %edx
+; FALLBACK22-NEXT:    shrxl %ebx, %edx, %edx
+; FALLBACK22-NEXT:    orl %esi, %edx
+; FALLBACK22-NEXT:    movl 20(%esp,%ecx), %esi
+; FALLBACK22-NEXT:    movl %esi, %ebp
+; FALLBACK22-NEXT:    shrl %ebp
+; FALLBACK22-NEXT:    shrxl %ebx, %ebp, %ebp
+; FALLBACK22-NEXT:    orl %edi, %ebp
+; FALLBACK22-NEXT:    shlxl %eax, %esi, %esi
+; FALLBACK22-NEXT:    movl 16(%esp,%ecx), %ecx
+; FALLBACK22-NEXT:    shlxl %eax, %ecx, %eax
+; FALLBACK22-NEXT:    shrl %ecx
+; FALLBACK22-NEXT:    shrxl %ebx, %ecx, %ecx
+; FALLBACK22-NEXT:    orl %esi, %ecx
+; FALLBACK22-NEXT:    movl {{[0-9]+}}(%esp), %esi
+; FALLBACK22-NEXT:    movl %eax, (%esi)
+; FALLBACK22-NEXT:    movl %ecx, 4(%esi)
+; FALLBACK22-NEXT:    movl %ebp, 8(%esi)
+; FALLBACK22-NEXT:    movl %edx, 12(%esi)
+; FALLBACK22-NEXT:    addl $44, %esp
+; FALLBACK22-NEXT:    popl %esi
+; FALLBACK22-NEXT:    popl %edi
+; FALLBACK22-NEXT:    popl %ebx
+; FALLBACK22-NEXT:    popl %ebp
+; FALLBACK22-NEXT:    retl
+;
+; FALLBACK23-LABEL: shl_16bytes:
+; FALLBACK23:       # %bb.0:
+; FALLBACK23-NEXT:    pushl %ebp
+; FALLBACK23-NEXT:    pushl %ebx
+; FALLBACK23-NEXT:    pushl %edi
+; FALLBACK23-NEXT:    pushl %esi
+; FALLBACK23-NEXT:    subl $44, %esp
+; FALLBACK23-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; FALLBACK23-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; FALLBACK23-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; FALLBACK23-NEXT:    movups (%edx), %xmm0
+; FALLBACK23-NEXT:    movzbl (%ecx), %edx
+; FALLBACK23-NEXT:    movl %edx, %ecx
+; FALLBACK23-NEXT:    shlb $3, %cl
+; FALLBACK23-NEXT:    xorps %xmm1, %xmm1
+; FALLBACK23-NEXT:    movaps %xmm1, (%esp)
+; FALLBACK23-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
+; FALLBACK23-NEXT:    andb $12, %dl
+; FALLBACK23-NEXT:    negb %dl
+; FALLBACK23-NEXT:    movsbl %dl, %edi
+; FALLBACK23-NEXT:    movl 24(%esp,%edi), %esi
+; FALLBACK23-NEXT:    movl 28(%esp,%edi), %edx
+; FALLBACK23-NEXT:    shldl %cl, %esi, %edx
+; FALLBACK23-NEXT:    movl 16(%esp,%edi), %ebx
+; FALLBACK23-NEXT:    movl 20(%esp,%edi), %edi
+; FALLBACK23-NEXT:    shldl %cl, %edi, %esi
+; FALLBACK23-NEXT:    shlxl %ecx, %ebx, %ebp
+; FALLBACK23-NEXT:    # kill: def $cl killed $cl killed $ecx
+; FALLBACK23-NEXT:    shldl %cl, %ebx, %edi
+; FALLBACK23-NEXT:    movl %edi, 4(%eax)
+; FALLBACK23-NEXT:    movl %esi, 8(%eax)
+; FALLBACK23-NEXT:    movl %edx, 12(%eax)
+; FALLBACK23-NEXT:    movl %ebp, (%eax)
+; FALLBACK23-NEXT:    addl $44, %esp
+; FALLBACK23-NEXT:    popl %esi
+; FALLBACK23-NEXT:    popl %edi
+; FALLBACK23-NEXT:    popl %ebx
+; FALLBACK23-NEXT:    popl %ebp
+; FALLBACK23-NEXT:    retl
+;
+; FALLBACK24-LABEL: shl_16bytes:
+; FALLBACK24:       # %bb.0:
+; FALLBACK24-NEXT:    pushl %ebp
+; FALLBACK24-NEXT:    pushl %ebx
+; FALLBACK24-NEXT:    pushl %edi
+; FALLBACK24-NEXT:    pushl %esi
+; FALLBACK24-NEXT:    subl $60, %esp
+; FALLBACK24-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; FALLBACK24-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; FALLBACK24-NEXT:    vmovups (%ecx), %xmm0
+; FALLBACK24-NEXT:    movzbl (%eax), %ecx
+; FALLBACK24-NEXT:    movl %ecx, %eax
+; FALLBACK24-NEXT:    shlb $3, %al
+; FALLBACK24-NEXT:    vxorps %xmm1, %xmm1, %xmm1
+; FALLBACK24-NEXT:    vmovaps %xmm1, {{[0-9]+}}(%esp)
+; FALLBACK24-NEXT:    vmovaps %xmm0, {{[0-9]+}}(%esp)
+; FALLBACK24-NEXT:    andb $12, %cl
+; FALLBACK24-NEXT:    negb %cl
+; FALLBACK24-NEXT:    movsbl %cl, %edi
+; FALLBACK24-NEXT:    movl 44(%esp,%edi), %ebx
+; FALLBACK24-NEXT:    movl %eax, %ecx
+; FALLBACK24-NEXT:    shll %cl, %ebx
+; FALLBACK24-NEXT:    movl %eax, %edx
+; FALLBACK24-NEXT:    notb %dl
+; FALLBACK24-NEXT:    movl 40(%esp,%edi), %ebp
+; FALLBACK24-NEXT:    movl %ebp, %esi
+; FALLBACK24-NEXT:    shrl %esi
+; FALLBACK24-NEXT:    movl %edx, %ecx
+; FALLBACK24-NEXT:    shrl %cl, %esi
+; FALLBACK24-NEXT:    orl %ebx, %esi
+; FALLBACK24-NEXT:    movl %eax, %ecx
+; FALLBACK24-NEXT:    shll %cl, %ebp
+; FALLBACK24-NEXT:    movl 32(%esp,%edi), %ecx
+; FALLBACK24-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK24-NEXT:    movl 36(%esp,%edi), %ebx
+; FALLBACK24-NEXT:    movl %ebx, %edi
+; FALLBACK24-NEXT:    shrl %edi
+; FALLBACK24-NEXT:    movl %edx, %ecx
+; FALLBACK24-NEXT:    shrl %cl, %edi
+; FALLBACK24-NEXT:    orl %ebp, %edi
+; FALLBACK24-NEXT:    movl %eax, %ecx
+; FALLBACK24-NEXT:    shll %cl, %ebx
+; FALLBACK24-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
+; FALLBACK24-NEXT:    shrl %ebp
+; FALLBACK24-NEXT:    movl %edx, %ecx
+; FALLBACK24-NEXT:    shrl %cl, %ebp
+; FALLBACK24-NEXT:    orl %ebx, %ebp
+; FALLBACK24-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; FALLBACK24-NEXT:    movl %eax, %ecx
+; FALLBACK24-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK24-NEXT:    shll %cl, %eax
+; FALLBACK24-NEXT:    movl %eax, (%edx)
+; FALLBACK24-NEXT:    movl %ebp, 4(%edx)
+; FALLBACK24-NEXT:    movl %edi, 8(%edx)
+; FALLBACK24-NEXT:    movl %esi, 12(%edx)
+; FALLBACK24-NEXT:    addl $60, %esp
+; FALLBACK24-NEXT:    popl %esi
+; FALLBACK24-NEXT:    popl %edi
+; FALLBACK24-NEXT:    popl %ebx
+; FALLBACK24-NEXT:    popl %ebp
+; FALLBACK24-NEXT:    retl
+;
+; FALLBACK25-LABEL: shl_16bytes:
+; FALLBACK25:       # %bb.0:
+; FALLBACK25-NEXT:    pushl %ebp
+; FALLBACK25-NEXT:    pushl %ebx
+; FALLBACK25-NEXT:    pushl %edi
+; FALLBACK25-NEXT:    pushl %esi
+; FALLBACK25-NEXT:    subl $44, %esp
+; FALLBACK25-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; FALLBACK25-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; FALLBACK25-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; FALLBACK25-NEXT:    vmovups (%edx), %xmm0
+; FALLBACK25-NEXT:    movzbl (%ecx), %edx
+; FALLBACK25-NEXT:    movl %edx, %ecx
+; FALLBACK25-NEXT:    shlb $3, %cl
+; FALLBACK25-NEXT:    vxorps %xmm1, %xmm1, %xmm1
+; FALLBACK25-NEXT:    vmovaps %xmm1, (%esp)
+; FALLBACK25-NEXT:    vmovaps %xmm0, {{[0-9]+}}(%esp)
+; FALLBACK25-NEXT:    andb $12, %dl
+; FALLBACK25-NEXT:    negb %dl
+; FALLBACK25-NEXT:    movsbl %dl, %edi
+; FALLBACK25-NEXT:    movl 24(%esp,%edi), %esi
+; FALLBACK25-NEXT:    movl 28(%esp,%edi), %edx
+; FALLBACK25-NEXT:    shldl %cl, %esi, %edx
+; FALLBACK25-NEXT:    movl 16(%esp,%edi), %ebx
+; FALLBACK25-NEXT:    movl 20(%esp,%edi), %edi
+; FALLBACK25-NEXT:    shldl %cl, %edi, %esi
+; FALLBACK25-NEXT:    movl %ebx, %ebp
+; FALLBACK25-NEXT:    shll %cl, %ebp
+; FALLBACK25-NEXT:    shldl %cl, %ebx, %edi
+; FALLBACK25-NEXT:    movl %edi, 4(%eax)
+; FALLBACK25-NEXT:    movl %esi, 8(%eax)
+; FALLBACK25-NEXT:    movl %edx, 12(%eax)
+; FALLBACK25-NEXT:    movl %ebp, (%eax)
+; FALLBACK25-NEXT:    addl $44, %esp
+; FALLBACK25-NEXT:    popl %esi
+; FALLBACK25-NEXT:    popl %edi
+; FALLBACK25-NEXT:    popl %ebx
+; FALLBACK25-NEXT:    popl %ebp
+; FALLBACK25-NEXT:    retl
+;
+; FALLBACK26-LABEL: shl_16bytes:
+; FALLBACK26:       # %bb.0:
+; FALLBACK26-NEXT:    pushl %ebp
+; FALLBACK26-NEXT:    pushl %ebx
+; FALLBACK26-NEXT:    pushl %edi
+; FALLBACK26-NEXT:    pushl %esi
+; FALLBACK26-NEXT:    subl $44, %esp
+; FALLBACK26-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; FALLBACK26-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; FALLBACK26-NEXT:    vmovups (%ecx), %xmm0
+; FALLBACK26-NEXT:    movzbl (%eax), %ecx
+; FALLBACK26-NEXT:    movl %ecx, %eax
+; FALLBACK26-NEXT:    shlb $3, %al
+; FALLBACK26-NEXT:    vxorps %xmm1, %xmm1, %xmm1
+; FALLBACK26-NEXT:    vmovaps %xmm1, (%esp)
+; FALLBACK26-NEXT:    vmovaps %xmm0, {{[0-9]+}}(%esp)
+; FALLBACK26-NEXT:    andb $12, %cl
+; FALLBACK26-NEXT:    negb %cl
+; FALLBACK26-NEXT:    movsbl %cl, %ecx
+; FALLBACK26-NEXT:    shlxl %eax, 28(%esp,%ecx), %esi
+; FALLBACK26-NEXT:    movl 24(%esp,%ecx), %edx
+; FALLBACK26-NEXT:    shlxl %eax, %edx, %edi
+; FALLBACK26-NEXT:    movl %eax, %ebx
+; FALLBACK26-NEXT:    notb %bl
+; FALLBACK26-NEXT:    shrl %edx
+; FALLBACK26-NEXT:    shrxl %ebx, %edx, %edx
+; FALLBACK26-NEXT:    orl %esi, %edx
+; FALLBACK26-NEXT:    movl 20(%esp,%ecx), %esi
+; FALLBACK26-NEXT:    movl %esi, %ebp
+; FALLBACK26-NEXT:    shrl %ebp
+; FALLBACK26-NEXT:    shrxl %ebx, %ebp, %ebp
+; FALLBACK26-NEXT:    orl %edi, %ebp
+; FALLBACK26-NEXT:    shlxl %eax, %esi, %esi
+; FALLBACK26-NEXT:    movl 16(%esp,%ecx), %ecx
+; FALLBACK26-NEXT:    shlxl %eax, %ecx, %eax
+; FALLBACK26-NEXT:    shrl %ecx
+; FALLBACK26-NEXT:    shrxl %ebx, %ecx, %ecx
+; FALLBACK26-NEXT:    orl %esi, %ecx
+; FALLBACK26-NEXT:    movl {{[0-9]+}}(%esp), %esi
+; FALLBACK26-NEXT:    movl %eax, (%esi)
+; FALLBACK26-NEXT:    movl %ecx, 4(%esi)
+; FALLBACK26-NEXT:    movl %ebp, 8(%esi)
+; FALLBACK26-NEXT:    movl %edx, 12(%esi)
+; FALLBACK26-NEXT:    addl $44, %esp
+; FALLBACK26-NEXT:    popl %esi
+; FALLBACK26-NEXT:    popl %edi
+; FALLBACK26-NEXT:    popl %ebx
+; FALLBACK26-NEXT:    popl %ebp
+; FALLBACK26-NEXT:    retl
+;
+; FALLBACK27-LABEL: shl_16bytes:
+; FALLBACK27:       # %bb.0:
+; FALLBACK27-NEXT:    pushl %ebp
+; FALLBACK27-NEXT:    pushl %ebx
+; FALLBACK27-NEXT:    pushl %edi
+; FALLBACK27-NEXT:    pushl %esi
+; FALLBACK27-NEXT:    subl $44, %esp
+; FALLBACK27-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; FALLBACK27-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; FALLBACK27-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; FALLBACK27-NEXT:    vmovups (%edx), %xmm0
+; FALLBACK27-NEXT:    movzbl (%ecx), %edx
+; FALLBACK27-NEXT:    movl %edx, %ecx
+; FALLBACK27-NEXT:    shlb $3, %cl
+; FALLBACK27-NEXT:    vxorps %xmm1, %xmm1, %xmm1
+; FALLBACK27-NEXT:    vmovaps %xmm1, (%esp)
+; FALLBACK27-NEXT:    vmovaps %xmm0, {{[0-9]+}}(%esp)
+; FALLBACK27-NEXT:    andb $12, %dl
+; FALLBACK27-NEXT:    negb %dl
+; FALLBACK27-NEXT:    movsbl %dl, %edi
+; FALLBACK27-NEXT:    movl 24(%esp,%edi), %esi
+; FALLBACK27-NEXT:    movl 28(%esp,%edi), %edx
+; FALLBACK27-NEXT:    shldl %cl, %esi, %edx
+; FALLBACK27-NEXT:    movl 16(%esp,%edi), %ebx
+; FALLBACK27-NEXT:    movl 20(%esp,%edi), %edi
+; FALLBACK27-NEXT:    shldl %cl, %edi, %esi
+; FALLBACK27-NEXT:    shlxl %ecx, %ebx, %ebp
+; FALLBACK27-NEXT:    # kill: def $cl killed $cl killed $ecx
+; FALLBACK27-NEXT:    shldl %cl, %ebx, %edi
+; FALLBACK27-NEXT:    movl %edi, 4(%eax)
+; FALLBACK27-NEXT:    movl %esi, 8(%eax)
+; FALLBACK27-NEXT:    movl %edx, 12(%eax)
+; FALLBACK27-NEXT:    movl %ebp, (%eax)
+; FALLBACK27-NEXT:    addl $44, %esp
+; FALLBACK27-NEXT:    popl %esi
+; FALLBACK27-NEXT:    popl %edi
+; FALLBACK27-NEXT:    popl %ebx
+; FALLBACK27-NEXT:    popl %ebp
+; FALLBACK27-NEXT:    retl
+;
+; FALLBACK28-LABEL: shl_16bytes:
+; FALLBACK28:       # %bb.0:
+; FALLBACK28-NEXT:    pushl %ebp
+; FALLBACK28-NEXT:    pushl %ebx
+; FALLBACK28-NEXT:    pushl %edi
+; FALLBACK28-NEXT:    pushl %esi
+; FALLBACK28-NEXT:    subl $60, %esp
+; FALLBACK28-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; FALLBACK28-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; FALLBACK28-NEXT:    vmovups (%ecx), %xmm0
+; FALLBACK28-NEXT:    movzbl (%eax), %ecx
+; FALLBACK28-NEXT:    movl %ecx, %eax
+; FALLBACK28-NEXT:    shlb $3, %al
+; FALLBACK28-NEXT:    vxorps %xmm1, %xmm1, %xmm1
+; FALLBACK28-NEXT:    vmovaps %xmm1, {{[0-9]+}}(%esp)
+; FALLBACK28-NEXT:    vmovaps %xmm0, {{[0-9]+}}(%esp)
+; FALLBACK28-NEXT:    andb $12, %cl
+; FALLBACK28-NEXT:    negb %cl
+; FALLBACK28-NEXT:    movsbl %cl, %edi
+; FALLBACK28-NEXT:    movl 44(%esp,%edi), %ebx
+; FALLBACK28-NEXT:    movl %eax, %ecx
+; FALLBACK28-NEXT:    shll %cl, %ebx
+; FALLBACK28-NEXT:    movl %eax, %edx
+; FALLBACK28-NEXT:    notb %dl
+; FALLBACK28-NEXT:    movl 40(%esp,%edi), %ebp
+; FALLBACK28-NEXT:    movl %ebp, %esi
+; FALLBACK28-NEXT:    shrl %esi
+; FALLBACK28-NEXT:    movl %edx, %ecx
+; FALLBACK28-NEXT:    shrl %cl, %esi
+; FALLBACK28-NEXT:    orl %ebx, %esi
+; FALLBACK28-NEXT:    movl %eax, %ecx
+; FALLBACK28-NEXT:    shll %cl, %ebp
+; FALLBACK28-NEXT:    movl 32(%esp,%edi), %ecx
+; FALLBACK28-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK28-NEXT:    movl 36(%esp,%edi), %ebx
+; FALLBACK28-NEXT:    movl %ebx, %edi
+; FALLBACK28-NEXT:    shrl %edi
+; FALLBACK28-NEXT:    movl %edx, %ecx
+; FALLBACK28-NEXT:    shrl %cl, %edi
+; FALLBACK28-NEXT:    orl %ebp, %edi
+; FALLBACK28-NEXT:    movl %eax, %ecx
+; FALLBACK28-NEXT:    shll %cl, %ebx
+; FALLBACK28-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
+; FALLBACK28-NEXT:    shrl %ebp
+; FALLBACK28-NEXT:    movl %edx, %ecx
+; FALLBACK28-NEXT:    shrl %cl, %ebp
+; FALLBACK28-NEXT:    orl %ebx, %ebp
+; FALLBACK28-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; FALLBACK28-NEXT:    movl %eax, %ecx
+; FALLBACK28-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK28-NEXT:    shll %cl, %eax
+; FALLBACK28-NEXT:    movl %eax, (%edx)
+; FALLBACK28-NEXT:    movl %ebp, 4(%edx)
+; FALLBACK28-NEXT:    movl %edi, 8(%edx)
+; FALLBACK28-NEXT:    movl %esi, 12(%edx)
+; FALLBACK28-NEXT:    addl $60, %esp
+; FALLBACK28-NEXT:    popl %esi
+; FALLBACK28-NEXT:    popl %edi
+; FALLBACK28-NEXT:    popl %ebx
+; FALLBACK28-NEXT:    popl %ebp
+; FALLBACK28-NEXT:    retl
+;
+; FALLBACK29-LABEL: shl_16bytes:
+; FALLBACK29:       # %bb.0:
+; FALLBACK29-NEXT:    pushl %ebp
+; FALLBACK29-NEXT:    pushl %ebx
+; FALLBACK29-NEXT:    pushl %edi
+; FALLBACK29-NEXT:    pushl %esi
+; FALLBACK29-NEXT:    subl $44, %esp
+; FALLBACK29-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; FALLBACK29-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; FALLBACK29-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; FALLBACK29-NEXT:    vmovups (%edx), %xmm0
+; FALLBACK29-NEXT:    movzbl (%ecx), %edx
+; FALLBACK29-NEXT:    movl %edx, %ecx
+; FALLBACK29-NEXT:    shlb $3, %cl
+; FALLBACK29-NEXT:    vxorps %xmm1, %xmm1, %xmm1
+; FALLBACK29-NEXT:    vmovaps %xmm1, (%esp)
+; FALLBACK29-NEXT:    vmovaps %xmm0, {{[0-9]+}}(%esp)
+; FALLBACK29-NEXT:    andb $12, %dl
+; FALLBACK29-NEXT:    negb %dl
+; FALLBACK29-NEXT:    movsbl %dl, %edi
+; FALLBACK29-NEXT:    movl 24(%esp,%edi), %esi
+; FALLBACK29-NEXT:    movl 28(%esp,%edi), %edx
+; FALLBACK29-NEXT:    shldl %cl, %esi, %edx
+; FALLBACK29-NEXT:    movl 16(%esp,%edi), %ebx
+; FALLBACK29-NEXT:    movl 20(%esp,%edi), %edi
+; FALLBACK29-NEXT:    shldl %cl, %edi, %esi
+; FALLBACK29-NEXT:    movl %ebx, %ebp
+; FALLBACK29-NEXT:    shll %cl, %ebp
+; FALLBACK29-NEXT:    shldl %cl, %ebx, %edi
+; FALLBACK29-NEXT:    movl %edi, 4(%eax)
+; FALLBACK29-NEXT:    movl %esi, 8(%eax)
+; FALLBACK29-NEXT:    movl %edx, 12(%eax)
+; FALLBACK29-NEXT:    movl %ebp, (%eax)
+; FALLBACK29-NEXT:    addl $44, %esp
+; FALLBACK29-NEXT:    popl %esi
+; FALLBACK29-NEXT:    popl %edi
+; FALLBACK29-NEXT:    popl %ebx
+; FALLBACK29-NEXT:    popl %ebp
+; FALLBACK29-NEXT:    retl
+;
+; FALLBACK30-LABEL: shl_16bytes:
+; FALLBACK30:       # %bb.0:
+; FALLBACK30-NEXT:    pushl %ebp
+; FALLBACK30-NEXT:    pushl %ebx
+; FALLBACK30-NEXT:    pushl %edi
+; FALLBACK30-NEXT:    pushl %esi
+; FALLBACK30-NEXT:    subl $44, %esp
+; FALLBACK30-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; FALLBACK30-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; FALLBACK30-NEXT:    vmovups (%ecx), %xmm0
+; FALLBACK30-NEXT:    movzbl (%eax), %ecx
+; FALLBACK30-NEXT:    movl %ecx, %eax
+; FALLBACK30-NEXT:    shlb $3, %al
+; FALLBACK30-NEXT:    vxorps %xmm1, %xmm1, %xmm1
+; FALLBACK30-NEXT:    vmovaps %xmm1, (%esp)
+; FALLBACK30-NEXT:    vmovaps %xmm0, {{[0-9]+}}(%esp)
+; FALLBACK30-NEXT:    andb $12, %cl
+; FALLBACK30-NEXT:    negb %cl
+; FALLBACK30-NEXT:    movsbl %cl, %ecx
+; FALLBACK30-NEXT:    shlxl %eax, 28(%esp,%ecx), %esi
+; FALLBACK30-NEXT:    movl 24(%esp,%ecx), %edx
+; FALLBACK30-NEXT:    shlxl %eax, %edx, %edi
+; FALLBACK30-NEXT:    movl %eax, %ebx
+; FALLBACK30-NEXT:    notb %bl
+; FALLBACK30-NEXT:    shrl %edx
+; FALLBACK30-NEXT:    shrxl %ebx, %edx, %edx
+; FALLBACK30-NEXT:    orl %esi, %edx
+; FALLBACK30-NEXT:    movl 20(%esp,%ecx), %esi
+; FALLBACK30-NEXT:    movl %esi, %ebp
+; FALLBACK30-NEXT:    shrl %ebp
+; FALLBACK30-NEXT:    shrxl %ebx, %ebp, %ebp
+; FALLBACK30-NEXT:    orl %edi, %ebp
+; FALLBACK30-NEXT:    shlxl %eax, %esi, %esi
+; FALLBACK30-NEXT:    movl 16(%esp,%ecx), %ecx
+; FALLBACK30-NEXT:    shlxl %eax, %ecx, %eax
+; FALLBACK30-NEXT:    shrl %ecx
+; FALLBACK30-NEXT:    shrxl %ebx, %ecx, %ecx
+; FALLBACK30-NEXT:    orl %esi, %ecx
+; FALLBACK30-NEXT:    movl {{[0-9]+}}(%esp), %esi
+; FALLBACK30-NEXT:    movl %eax, (%esi)
+; FALLBACK30-NEXT:    movl %ecx, 4(%esi)
+; FALLBACK30-NEXT:    movl %ebp, 8(%esi)
+; FALLBACK30-NEXT:    movl %edx, 12(%esi)
+; FALLBACK30-NEXT:    addl $44, %esp
+; FALLBACK30-NEXT:    popl %esi
+; FALLBACK30-NEXT:    popl %edi
+; FALLBACK30-NEXT:    popl %ebx
+; FALLBACK30-NEXT:    popl %ebp
+; FALLBACK30-NEXT:    retl
+;
+; FALLBACK31-LABEL: shl_16bytes:
+; FALLBACK31:       # %bb.0:
+; FALLBACK31-NEXT:    pushl %ebp
+; FALLBACK31-NEXT:    pushl %ebx
+; FALLBACK31-NEXT:    pushl %edi
+; FALLBACK31-NEXT:    pushl %esi
+; FALLBACK31-NEXT:    subl $44, %esp
+; FALLBACK31-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; FALLBACK31-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; FALLBACK31-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; FALLBACK31-NEXT:    vmovups (%edx), %xmm0
+; FALLBACK31-NEXT:    movzbl (%ecx), %edx
+; FALLBACK31-NEXT:    movl %edx, %ecx
+; FALLBACK31-NEXT:    shlb $3, %cl
+; FALLBACK31-NEXT:    vxorps %xmm1, %xmm1, %xmm1
+; FALLBACK31-NEXT:    vmovaps %xmm1, (%esp)
+; FALLBACK31-NEXT:    vmovaps %xmm0, {{[0-9]+}}(%esp)
+; FALLBACK31-NEXT:    andb $12, %dl
+; FALLBACK31-NEXT:    negb %dl
+; FALLBACK31-NEXT:    movsbl %dl, %edi
+; FALLBACK31-NEXT:    movl 24(%esp,%edi), %esi
+; FALLBACK31-NEXT:    movl 28(%esp,%edi), %edx
+; FALLBACK31-NEXT:    shldl %cl, %esi, %edx
+; FALLBACK31-NEXT:    movl 16(%esp,%edi), %ebx
+; FALLBACK31-NEXT:    movl 20(%esp,%edi), %edi
+; FALLBACK31-NEXT:    shldl %cl, %edi, %esi
+; FALLBACK31-NEXT:    shlxl %ecx, %ebx, %ebp
+; FALLBACK31-NEXT:    # kill: def $cl killed $cl killed $ecx
+; FALLBACK31-NEXT:    shldl %cl, %ebx, %edi
+; FALLBACK31-NEXT:    movl %edi, 4(%eax)
+; FALLBACK31-NEXT:    movl %esi, 8(%eax)
+; FALLBACK31-NEXT:    movl %edx, 12(%eax)
+; FALLBACK31-NEXT:    movl %ebp, (%eax)
+; FALLBACK31-NEXT:    addl $44, %esp
+; FALLBACK31-NEXT:    popl %esi
+; FALLBACK31-NEXT:    popl %edi
+; FALLBACK31-NEXT:    popl %ebx
+; FALLBACK31-NEXT:    popl %ebp
+; FALLBACK31-NEXT:    retl
+  %src = load i128, ptr %src.ptr, align 1
+  %byteOff = load i128, ptr %byteOff.ptr, align 1
+  %bitOff = shl i128 %byteOff, 3
+  %res = shl i128 %src, %bitOff
+  store i128 %res, ptr %dst, align 1
+  ret void
+}
+
+define void @shl_16bytes_dwordOff(ptr %src.ptr, ptr %dwordOff.ptr, ptr %dst) nounwind {
+; X64-NO-SHLD-NO-BMI2-LABEL: shl_16bytes_dwordOff:
+; X64-NO-SHLD-NO-BMI2:       # %bb.0:
+; X64-NO-SHLD-NO-BMI2-NEXT:    movq (%rdi), %r8
+; X64-NO-SHLD-NO-BMI2-NEXT:    movq 8(%rdi), %rdi
+; X64-NO-SHLD-NO-BMI2-NEXT:    movzbl (%rsi), %eax
+; X64-NO-SHLD-NO-BMI2-NEXT:    shlb $5, %al
+; X64-NO-SHLD-NO-BMI2-NEXT:    movl %eax, %ecx
+; X64-NO-SHLD-NO-BMI2-NEXT:    shlq %cl, %rdi
+; X64-NO-SHLD-NO-BMI2-NEXT:    movq %r8, %rsi
+; X64-NO-SHLD-NO-BMI2-NEXT:    shrq %rsi
+; X64-NO-SHLD-NO-BMI2-NEXT:    notb %cl
+; X64-NO-SHLD-NO-BMI2-NEXT:    shrq %cl, %rsi
+; X64-NO-SHLD-NO-BMI2-NEXT:    orq %rdi, %rsi
+; X64-NO-SHLD-NO-BMI2-NEXT:    movl %eax, %ecx
+; X64-NO-SHLD-NO-BMI2-NEXT:    shlq %cl, %r8
+; X64-NO-SHLD-NO-BMI2-NEXT:    xorl %ecx, %ecx
+; X64-NO-SHLD-NO-BMI2-NEXT:    testb $64, %al
+; X64-NO-SHLD-NO-BMI2-NEXT:    cmovneq %r8, %rsi
+; X64-NO-SHLD-NO-BMI2-NEXT:    cmoveq %r8, %rcx
+; X64-NO-SHLD-NO-BMI2-NEXT:    movq %rcx, (%rdx)
+; X64-NO-SHLD-NO-BMI2-NEXT:    movq %rsi, 8(%rdx)
+; X64-NO-SHLD-NO-BMI2-NEXT:    retq
+;
+; X64-HAVE-SHLD-NO-BMI2-LABEL: shl_16bytes_dwordOff:
+; X64-HAVE-SHLD-NO-BMI2:       # %bb.0:
+; X64-HAVE-SHLD-NO-BMI2-NEXT:    movq (%rdi), %rax
+; X64-HAVE-SHLD-NO-BMI2-NEXT:    movq 8(%rdi), %rdi
+; X64-HAVE-SHLD-NO-BMI2-NEXT:    movzbl (%rsi), %ecx
+; X64-HAVE-SHLD-NO-BMI2-NEXT:    shlb $5, %cl
+; X64-HAVE-SHLD-NO-BMI2-NEXT:    movq %rax, %rsi
+; X64-HAVE-SHLD-NO-BMI2-NEXT:    shlq %cl, %rsi
+; X64-HAVE-SHLD-NO-BMI2-NEXT:    shldq %cl, %rax, %rdi
+; X64-HAVE-SHLD-NO-BMI2-NEXT:    xorl %eax, %eax
+; X64-HAVE-SHLD-NO-BMI2-NEXT:    testb $64, %cl
+; X64-HAVE-SHLD-NO-BMI2-NEXT:    cmovneq %rsi, %rdi
+; X64-HAVE-SHLD-NO-BMI2-NEXT:    cmoveq %rsi, %rax
+; X64-HAVE-SHLD-NO-BMI2-NEXT:    movq %rdi, 8(%rdx)
+; X64-HAVE-SHLD-NO-BMI2-NEXT:    movq %rax, (%rdx)
+; X64-HAVE-SHLD-NO-BMI2-NEXT:    retq
+;
+; X64-NO-SHLD-HAVE-BMI2-LABEL: shl_16bytes_dwordOff:
+; X64-NO-SHLD-HAVE-BMI2:       # %bb.0:
+; X64-NO-SHLD-HAVE-BMI2-NEXT:    movq (%rdi), %rax
+; X64-NO-SHLD-HAVE-BMI2-NEXT:    movzbl (%rsi), %ecx
+; X64-NO-SHLD-HAVE-BMI2-NEXT:    shlb $5, %cl
+; X64-NO-SHLD-HAVE-BMI2-NEXT:    shlxq %rcx, 8(%rdi), %rsi
+; X64-NO-SHLD-HAVE-BMI2-NEXT:    movl %ecx, %edi
+; X64-NO-SHLD-HAVE-BMI2-NEXT:    notb %dil
+; X64-NO-SHLD-HAVE-BMI2-NEXT:    shlxq %rcx, %rax, %r8
+; X64-NO-SHLD-HAVE-BMI2-NEXT:    shrq %rax
+; X64-NO-SHLD-HAVE-BMI2-NEXT:    shrxq %rdi, %rax, %rax
+; X64-NO-SHLD-HAVE-BMI2-NEXT:    orq %rsi, %rax
+; X64-NO-SHLD-HAVE-BMI2-NEXT:    xorl %esi, %esi
+; X64-NO-SHLD-HAVE-BMI2-NEXT:    testb $64, %cl
+; X64-NO-SHLD-HAVE-BMI2-NEXT:    cmovneq %r8, %rax
+; X64-NO-SHLD-HAVE-BMI2-NEXT:    cmoveq %r8, %rsi
+; X64-NO-SHLD-HAVE-BMI2-NEXT:    movq %rsi, (%rdx)
+; X64-NO-SHLD-HAVE-BMI2-NEXT:    movq %rax, 8(%rdx)
+; X64-NO-SHLD-HAVE-BMI2-NEXT:    retq
+;
+; X64-HAVE-SHLD-HAVE-BMI2-LABEL: shl_16bytes_dwordOff:
+; X64-HAVE-SHLD-HAVE-BMI2:       # %bb.0:
+; X64-HAVE-SHLD-HAVE-BMI2-NEXT:    movq (%rdi), %rax
+; X64-HAVE-SHLD-HAVE-BMI2-NEXT:    movq 8(%rdi), %rdi
+; X64-HAVE-SHLD-HAVE-BMI2-NEXT:    movzbl (%rsi), %ecx
+; X64-HAVE-SHLD-HAVE-BMI2-NEXT:    shlb $5, %cl
+; X64-HAVE-SHLD-HAVE-BMI2-NEXT:    shldq %cl, %rax, %rdi
+; X64-HAVE-SHLD-HAVE-BMI2-NEXT:    shlxq %rcx, %rax, %rax
+; X64-HAVE-SHLD-HAVE-BMI2-NEXT:    xorl %esi, %esi
+; X64-HAVE-SHLD-HAVE-BMI2-NEXT:    testb $64, %cl
+; X64-HAVE-SHLD-HAVE-BMI2-NEXT:    cmovneq %rax, %rdi
+; X64-HAVE-SHLD-HAVE-BMI2-NEXT:    cmoveq %rax, %rsi
+; X64-HAVE-SHLD-HAVE-BMI2-NEXT:    movq %rdi, 8(%rdx)
+; X64-HAVE-SHLD-HAVE-BMI2-NEXT:    movq %rsi, (%rdx)
+; X64-HAVE-SHLD-HAVE-BMI2-NEXT:    retq
+;
+; X86-SSE2-LABEL: shl_16bytes_dwordOff:
 ; X86-SSE2:       # %bb.0:
 ; X86-SSE2-NEXT:    pushl %ebx
 ; X86-SSE2-NEXT:    pushl %edi
@@ -814,15 +2545,14 @@ define void @shl_16bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ; X86-SSE2-NEXT:    movl 8(%edx), %ebx
 ; X86-SSE2-NEXT:    movl 12(%edx), %edx
 ; X86-SSE2-NEXT:    movzbl (%ecx), %ecx
+; X86-SSE2-NEXT:    xorps %xmm0, %xmm0
+; X86-SSE2-NEXT:    movaps %xmm0, (%esp)
 ; X86-SSE2-NEXT:    movl %edx, {{[0-9]+}}(%esp)
 ; X86-SSE2-NEXT:    movl %ebx, {{[0-9]+}}(%esp)
 ; X86-SSE2-NEXT:    movl %edi, {{[0-9]+}}(%esp)
 ; X86-SSE2-NEXT:    movl %esi, {{[0-9]+}}(%esp)
-; X86-SSE2-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-SSE2-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-SSE2-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-SSE2-NEXT:    movl $0, (%esp)
-; X86-SSE2-NEXT:    andb $15, %cl
+; X86-SSE2-NEXT:    shlb $2, %cl
+; X86-SSE2-NEXT:    andb $12, %cl
 ; X86-SSE2-NEXT:    negb %cl
 ; X86-SSE2-NEXT:    movsbl %cl, %ecx
 ; X86-SSE2-NEXT:    movl 16(%esp,%ecx), %edx
@@ -839,50 +2569,53 @@ define void @shl_16bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ; X86-SSE2-NEXT:    popl %ebx
 ; X86-SSE2-NEXT:    retl
 ;
-; X86-SSE42-LABEL: shl_16bytes:
+; X86-SSE42-LABEL: shl_16bytes_dwordOff:
 ; X86-SSE42:       # %bb.0:
-; X86-SSE42-NEXT:    subl $32, %esp
+; X86-SSE42-NEXT:    subl $44, %esp
 ; X86-SSE42-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-SSE42-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-SSE42-NEXT:    movl {{[0-9]+}}(%esp), %edx
 ; X86-SSE42-NEXT:    movups (%edx), %xmm0
 ; X86-SSE42-NEXT:    movzbl (%ecx), %ecx
 ; X86-SSE42-NEXT:    xorps %xmm1, %xmm1
-; X86-SSE42-NEXT:    movups %xmm1, (%esp)
-; X86-SSE42-NEXT:    movups %xmm0, {{[0-9]+}}(%esp)
-; X86-SSE42-NEXT:    andb $15, %cl
+; X86-SSE42-NEXT:    movaps %xmm1, (%esp)
+; X86-SSE42-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
+; X86-SSE42-NEXT:    shlb $2, %cl
+; X86-SSE42-NEXT:    andb $12, %cl
 ; X86-SSE42-NEXT:    negb %cl
 ; X86-SSE42-NEXT:    movsbl %cl, %ecx
 ; X86-SSE42-NEXT:    movups 16(%esp,%ecx), %xmm0
 ; X86-SSE42-NEXT:    movups %xmm0, (%eax)
-; X86-SSE42-NEXT:    addl $32, %esp
+; X86-SSE42-NEXT:    addl $44, %esp
 ; X86-SSE42-NEXT:    retl
 ;
-; X86-AVX-LABEL: shl_16bytes:
+; X86-AVX-LABEL: shl_16bytes_dwordOff:
 ; X86-AVX:       # %bb.0:
-; X86-AVX-NEXT:    subl $32, %esp
+; X86-AVX-NEXT:    subl $44, %esp
 ; X86-AVX-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-AVX-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-AVX-NEXT:    movl {{[0-9]+}}(%esp), %edx
 ; X86-AVX-NEXT:    vmovups (%edx), %xmm0
 ; X86-AVX-NEXT:    movzbl (%ecx), %ecx
 ; X86-AVX-NEXT:    vxorps %xmm1, %xmm1, %xmm1
-; X86-AVX-NEXT:    vmovups %xmm1, (%esp)
-; X86-AVX-NEXT:    vmovups %xmm0, {{[0-9]+}}(%esp)
-; X86-AVX-NEXT:    andb $15, %cl
+; X86-AVX-NEXT:    vmovaps %xmm1, (%esp)
+; X86-AVX-NEXT:    vmovaps %xmm0, {{[0-9]+}}(%esp)
+; X86-AVX-NEXT:    shlb $2, %cl
+; X86-AVX-NEXT:    andb $12, %cl
 ; X86-AVX-NEXT:    negb %cl
 ; X86-AVX-NEXT:    movsbl %cl, %ecx
 ; X86-AVX-NEXT:    vmovups 16(%esp,%ecx), %xmm0
 ; X86-AVX-NEXT:    vmovups %xmm0, (%eax)
-; X86-AVX-NEXT:    addl $32, %esp
+; X86-AVX-NEXT:    addl $44, %esp
 ; X86-AVX-NEXT:    retl
   %src = load i128, ptr %src.ptr, align 1
-  %byteOff = load i128, ptr %byteOff.ptr, align 1
-  %bitOff = shl i128 %byteOff, 3
+  %dwordOff = load i128, ptr %dwordOff.ptr, align 1
+  %bitOff = shl i128 %dwordOff, 5
   %res = shl i128 %src, %bitOff
   store i128 %res, ptr %dst, align 1
   ret void
 }
+
 define void @ashr_16bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ; X64-NO-SHLD-NO-BMI2-LABEL: ashr_16bytes:
 ; X64-NO-SHLD-NO-BMI2:       # %bb.0:
@@ -960,50 +2693,355 @@ define void @ashr_16bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ; X64-HAVE-SHLD-HAVE-BMI2-NEXT:    movq %rax, (%rdx)
 ; X64-HAVE-SHLD-HAVE-BMI2-NEXT:    retq
 ;
-; X86-SSE2-LABEL: ashr_16bytes:
-; X86-SSE2:       # %bb.0:
-; X86-SSE2-NEXT:    pushl %ebx
-; X86-SSE2-NEXT:    pushl %edi
-; X86-SSE2-NEXT:    pushl %esi
-; X86-SSE2-NEXT:    subl $32, %esp
-; X86-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-SSE2-NEXT:    movl (%edx), %esi
-; X86-SSE2-NEXT:    movl 4(%edx), %edi
-; X86-SSE2-NEXT:    movl 8(%edx), %ebx
-; X86-SSE2-NEXT:    movl 12(%edx), %edx
-; X86-SSE2-NEXT:    movzbl (%ecx), %ecx
-; X86-SSE2-NEXT:    movl %edx, {{[0-9]+}}(%esp)
-; X86-SSE2-NEXT:    movl %ebx, {{[0-9]+}}(%esp)
-; X86-SSE2-NEXT:    movl %edi, {{[0-9]+}}(%esp)
-; X86-SSE2-NEXT:    movl %esi, (%esp)
-; X86-SSE2-NEXT:    sarl $31, %edx
-; X86-SSE2-NEXT:    movl %edx, {{[0-9]+}}(%esp)
-; X86-SSE2-NEXT:    movl %edx, {{[0-9]+}}(%esp)
-; X86-SSE2-NEXT:    movl %edx, {{[0-9]+}}(%esp)
-; X86-SSE2-NEXT:    movl %edx, {{[0-9]+}}(%esp)
-; X86-SSE2-NEXT:    andl $15, %ecx
-; X86-SSE2-NEXT:    movl (%esp,%ecx), %edx
-; X86-SSE2-NEXT:    movl 4(%esp,%ecx), %esi
-; X86-SSE2-NEXT:    movl 12(%esp,%ecx), %edi
-; X86-SSE2-NEXT:    movl 8(%esp,%ecx), %ecx
-; X86-SSE2-NEXT:    movl %ecx, 8(%eax)
-; X86-SSE2-NEXT:    movl %edi, 12(%eax)
-; X86-SSE2-NEXT:    movl %edx, (%eax)
-; X86-SSE2-NEXT:    movl %esi, 4(%eax)
-; X86-SSE2-NEXT:    addl $32, %esp
-; X86-SSE2-NEXT:    popl %esi
-; X86-SSE2-NEXT:    popl %edi
-; X86-SSE2-NEXT:    popl %ebx
-; X86-SSE2-NEXT:    retl
+; X86-NO-SHLD-NO-BMI2-LABEL: ashr_16bytes:
+; X86-NO-SHLD-NO-BMI2:       # %bb.0:
+; X86-NO-SHLD-NO-BMI2-NEXT:    pushl %ebp
+; X86-NO-SHLD-NO-BMI2-NEXT:    pushl %ebx
+; X86-NO-SHLD-NO-BMI2-NEXT:    pushl %edi
+; X86-NO-SHLD-NO-BMI2-NEXT:    pushl %esi
+; X86-NO-SHLD-NO-BMI2-NEXT:    subl $60, %esp
+; X86-NO-SHLD-NO-BMI2-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NO-SHLD-NO-BMI2-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NO-SHLD-NO-BMI2-NEXT:    movl (%ecx), %edx
+; X86-NO-SHLD-NO-BMI2-NEXT:    movl 4(%ecx), %esi
+; X86-NO-SHLD-NO-BMI2-NEXT:    movl 8(%ecx), %edi
+; X86-NO-SHLD-NO-BMI2-NEXT:    movl 12(%ecx), %ecx
+; X86-NO-SHLD-NO-BMI2-NEXT:    movb (%eax), %ah
+; X86-NO-SHLD-NO-BMI2-NEXT:    movb %ah, %al
+; X86-NO-SHLD-NO-BMI2-NEXT:    shlb $3, %al
+; X86-NO-SHLD-NO-BMI2-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-NO-BMI2-NEXT:    movl %edi, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-NO-BMI2-NEXT:    movl %esi, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-NO-BMI2-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-NO-BMI2-NEXT:    sarl $31, %ecx
+; X86-NO-SHLD-NO-BMI2-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-NO-BMI2-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-NO-BMI2-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-NO-BMI2-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-NO-BMI2-NEXT:    andb $12, %ah
+; X86-NO-SHLD-NO-BMI2-NEXT:    movzbl %ah, %ebp
+; X86-NO-SHLD-NO-BMI2-NEXT:    movl 20(%esp,%ebp), %esi
+; X86-NO-SHLD-NO-BMI2-NEXT:    movl %esi, %ebx
+; X86-NO-SHLD-NO-BMI2-NEXT:    movl %eax, %ecx
+; X86-NO-SHLD-NO-BMI2-NEXT:    shrl %cl, %ebx
+; X86-NO-SHLD-NO-BMI2-NEXT:    movl %eax, %edx
+; X86-NO-SHLD-NO-BMI2-NEXT:    notb %dl
+; X86-NO-SHLD-NO-BMI2-NEXT:    movl 24(%esp,%ebp), %ecx
+; X86-NO-SHLD-NO-BMI2-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-NO-BMI2-NEXT:    leal (%ecx,%ecx), %edi
+; X86-NO-SHLD-NO-BMI2-NEXT:    movl %edx, %ecx
+; X86-NO-SHLD-NO-BMI2-NEXT:    shll %cl, %edi
+; X86-NO-SHLD-NO-BMI2-NEXT:    orl %ebx, %edi
+; X86-NO-SHLD-NO-BMI2-NEXT:    movl 16(%esp,%ebp), %ebx
+; X86-NO-SHLD-NO-BMI2-NEXT:    movl %eax, %ecx
+; X86-NO-SHLD-NO-BMI2-NEXT:    shrl %cl, %ebx
+; X86-NO-SHLD-NO-BMI2-NEXT:    addl %esi, %esi
+; X86-NO-SHLD-NO-BMI2-NEXT:    movl %edx, %ecx
+; X86-NO-SHLD-NO-BMI2-NEXT:    shll %cl, %esi
+; X86-NO-SHLD-NO-BMI2-NEXT:    orl %ebx, %esi
+; X86-NO-SHLD-NO-BMI2-NEXT:    movl %eax, %ecx
+; X86-NO-SHLD-NO-BMI2-NEXT:    shrl %cl, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NO-SHLD-NO-BMI2-NEXT:    movl 28(%esp,%ebp), %ebx
+; X86-NO-SHLD-NO-BMI2-NEXT:    leal (%ebx,%ebx), %ebp
+; X86-NO-SHLD-NO-BMI2-NEXT:    movl %edx, %ecx
+; X86-NO-SHLD-NO-BMI2-NEXT:    shll %cl, %ebp
+; X86-NO-SHLD-NO-BMI2-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
+; X86-NO-SHLD-NO-BMI2-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-NO-SHLD-NO-BMI2-NEXT:    movl %eax, %ecx
+; X86-NO-SHLD-NO-BMI2-NEXT:    sarl %cl, %ebx
+; X86-NO-SHLD-NO-BMI2-NEXT:    movl %ebx, 12(%edx)
+; X86-NO-SHLD-NO-BMI2-NEXT:    movl %ebp, 8(%edx)
+; X86-NO-SHLD-NO-BMI2-NEXT:    movl %esi, (%edx)
+; X86-NO-SHLD-NO-BMI2-NEXT:    movl %edi, 4(%edx)
+; X86-NO-SHLD-NO-BMI2-NEXT:    addl $60, %esp
+; X86-NO-SHLD-NO-BMI2-NEXT:    popl %esi
+; X86-NO-SHLD-NO-BMI2-NEXT:    popl %edi
+; X86-NO-SHLD-NO-BMI2-NEXT:    popl %ebx
+; X86-NO-SHLD-NO-BMI2-NEXT:    popl %ebp
+; X86-NO-SHLD-NO-BMI2-NEXT:    retl
 ;
-; X86-SSE42-LABEL: ashr_16bytes:
-; X86-SSE42:       # %bb.0:
-; X86-SSE42-NEXT:    pushl %ebx
-; X86-SSE42-NEXT:    pushl %edi
-; X86-SSE42-NEXT:    pushl %esi
-; X86-SSE42-NEXT:    subl $32, %esp
+; X86-HAVE-SHLD-NO-BMI2-LABEL: ashr_16bytes:
+; X86-HAVE-SHLD-NO-BMI2:       # %bb.0:
+; X86-HAVE-SHLD-NO-BMI2-NEXT:    pushl %ebp
+; X86-HAVE-SHLD-NO-BMI2-NEXT:    pushl %ebx
+; X86-HAVE-SHLD-NO-BMI2-NEXT:    pushl %edi
+; X86-HAVE-SHLD-NO-BMI2-NEXT:    pushl %esi
+; X86-HAVE-SHLD-NO-BMI2-NEXT:    subl $44, %esp
+; X86-HAVE-SHLD-NO-BMI2-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-HAVE-SHLD-NO-BMI2-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-HAVE-SHLD-NO-BMI2-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-HAVE-SHLD-NO-BMI2-NEXT:    movl (%edx), %esi
+; X86-HAVE-SHLD-NO-BMI2-NEXT:    movl 4(%edx), %edi
+; X86-HAVE-SHLD-NO-BMI2-NEXT:    movl 8(%edx), %ebx
+; X86-HAVE-SHLD-NO-BMI2-NEXT:    movl 12(%edx), %edx
+; X86-HAVE-SHLD-NO-BMI2-NEXT:    movb (%ecx), %ch
+; X86-HAVE-SHLD-NO-BMI2-NEXT:    movb %ch, %cl
+; X86-HAVE-SHLD-NO-BMI2-NEXT:    shlb $3, %cl
+; X86-HAVE-SHLD-NO-BMI2-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-NO-BMI2-NEXT:    movl %ebx, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-NO-BMI2-NEXT:    movl %edi, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-NO-BMI2-NEXT:    movl %esi, (%esp)
+; X86-HAVE-SHLD-NO-BMI2-NEXT:    sarl $31, %edx
+; X86-HAVE-SHLD-NO-BMI2-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-NO-BMI2-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-NO-BMI2-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-NO-BMI2-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-NO-BMI2-NEXT:    andb $12, %ch
+; X86-HAVE-SHLD-NO-BMI2-NEXT:    movzbl %ch, %ebx
+; X86-HAVE-SHLD-NO-BMI2-NEXT:    movl 8(%esp,%ebx), %esi
+; X86-HAVE-SHLD-NO-BMI2-NEXT:    movl (%esp,%ebx), %edx
+; X86-HAVE-SHLD-NO-BMI2-NEXT:    movl 4(%esp,%ebx), %ebp
+; X86-HAVE-SHLD-NO-BMI2-NEXT:    movl %ebp, %edi
+; X86-HAVE-SHLD-NO-BMI2-NEXT:    shrdl %cl, %esi, %edi
+; X86-HAVE-SHLD-NO-BMI2-NEXT:    movl 12(%esp,%ebx), %ebx
+; X86-HAVE-SHLD-NO-BMI2-NEXT:    shrdl %cl, %ebx, %esi
+; X86-HAVE-SHLD-NO-BMI2-NEXT:    shrdl %cl, %ebp, %edx
+; X86-HAVE-SHLD-NO-BMI2-NEXT:    sarl %cl, %ebx
+; X86-HAVE-SHLD-NO-BMI2-NEXT:    movl %esi, 8(%eax)
+; X86-HAVE-SHLD-NO-BMI2-NEXT:    movl %ebx, 12(%eax)
+; X86-HAVE-SHLD-NO-BMI2-NEXT:    movl %edx, (%eax)
+; X86-HAVE-SHLD-NO-BMI2-NEXT:    movl %edi, 4(%eax)
+; X86-HAVE-SHLD-NO-BMI2-NEXT:    addl $44, %esp
+; X86-HAVE-SHLD-NO-BMI2-NEXT:    popl %esi
+; X86-HAVE-SHLD-NO-BMI2-NEXT:    popl %edi
+; X86-HAVE-SHLD-NO-BMI2-NEXT:    popl %ebx
+; X86-HAVE-SHLD-NO-BMI2-NEXT:    popl %ebp
+; X86-HAVE-SHLD-NO-BMI2-NEXT:    retl
+;
+; X86-NO-SHLD-HAVE-BMI2-LABEL: ashr_16bytes:
+; X86-NO-SHLD-HAVE-BMI2:       # %bb.0:
+; X86-NO-SHLD-HAVE-BMI2-NEXT:    pushl %ebp
+; X86-NO-SHLD-HAVE-BMI2-NEXT:    pushl %ebx
+; X86-NO-SHLD-HAVE-BMI2-NEXT:    pushl %edi
+; X86-NO-SHLD-HAVE-BMI2-NEXT:    pushl %esi
+; X86-NO-SHLD-HAVE-BMI2-NEXT:    subl $44, %esp
+; X86-NO-SHLD-HAVE-BMI2-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NO-SHLD-HAVE-BMI2-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NO-SHLD-HAVE-BMI2-NEXT:    movl (%ecx), %edx
+; X86-NO-SHLD-HAVE-BMI2-NEXT:    movl 4(%ecx), %esi
+; X86-NO-SHLD-HAVE-BMI2-NEXT:    movl 8(%ecx), %edi
+; X86-NO-SHLD-HAVE-BMI2-NEXT:    movl 12(%ecx), %ecx
+; X86-NO-SHLD-HAVE-BMI2-NEXT:    movzbl (%eax), %ebx
+; X86-NO-SHLD-HAVE-BMI2-NEXT:    movl %ebx, %eax
+; X86-NO-SHLD-HAVE-BMI2-NEXT:    shlb $3, %al
+; X86-NO-SHLD-HAVE-BMI2-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-HAVE-BMI2-NEXT:    movl %edi, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-HAVE-BMI2-NEXT:    movl %esi, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-HAVE-BMI2-NEXT:    movl %edx, (%esp)
+; X86-NO-SHLD-HAVE-BMI2-NEXT:    sarl $31, %ecx
+; X86-NO-SHLD-HAVE-BMI2-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-HAVE-BMI2-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-HAVE-BMI2-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-HAVE-BMI2-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-HAVE-BMI2-NEXT:    andb $12, %bl
+; X86-NO-SHLD-HAVE-BMI2-NEXT:    movzbl %bl, %esi
+; X86-NO-SHLD-HAVE-BMI2-NEXT:    movl 4(%esp,%esi), %edi
+; X86-NO-SHLD-HAVE-BMI2-NEXT:    movl 8(%esp,%esi), %ebx
+; X86-NO-SHLD-HAVE-BMI2-NEXT:    shrxl %eax, %edi, %ebp
+; X86-NO-SHLD-HAVE-BMI2-NEXT:    movl %eax, %edx
+; X86-NO-SHLD-HAVE-BMI2-NEXT:    notb %dl
+; X86-NO-SHLD-HAVE-BMI2-NEXT:    leal (%ebx,%ebx), %ecx
+; X86-NO-SHLD-HAVE-BMI2-NEXT:    shlxl %edx, %ecx, %ecx
+; X86-NO-SHLD-HAVE-BMI2-NEXT:    orl %ebp, %ecx
+; X86-NO-SHLD-HAVE-BMI2-NEXT:    shrxl %eax, (%esp,%esi), %ebp
+; X86-NO-SHLD-HAVE-BMI2-NEXT:    addl %edi, %edi
+; X86-NO-SHLD-HAVE-BMI2-NEXT:    shlxl %edx, %edi, %edi
+; X86-NO-SHLD-HAVE-BMI2-NEXT:    orl %ebp, %edi
+; X86-NO-SHLD-HAVE-BMI2-NEXT:    shrxl %eax, %ebx, %ebx
+; X86-NO-SHLD-HAVE-BMI2-NEXT:    movl 12(%esp,%esi), %esi
+; X86-NO-SHLD-HAVE-BMI2-NEXT:    sarxl %eax, %esi, %eax
+; X86-NO-SHLD-HAVE-BMI2-NEXT:    addl %esi, %esi
+; X86-NO-SHLD-HAVE-BMI2-NEXT:    shlxl %edx, %esi, %edx
+; X86-NO-SHLD-HAVE-BMI2-NEXT:    orl %ebx, %edx
+; X86-NO-SHLD-HAVE-BMI2-NEXT:    movl {{[0-9]+}}(%esp), %esi
+; X86-NO-SHLD-HAVE-BMI2-NEXT:    movl %eax, 12(%esi)
+; X86-NO-SHLD-HAVE-BMI2-NEXT:    movl %edx, 8(%esi)
+; X86-NO-SHLD-HAVE-BMI2-NEXT:    movl %edi, (%esi)
+; X86-NO-SHLD-HAVE-BMI2-NEXT:    movl %ecx, 4(%esi)
+; X86-NO-SHLD-HAVE-BMI2-NEXT:    addl $44, %esp
+; X86-NO-SHLD-HAVE-BMI2-NEXT:    popl %esi
+; X86-NO-SHLD-HAVE-BMI2-NEXT:    popl %edi
+; X86-NO-SHLD-HAVE-BMI2-NEXT:    popl %ebx
+; X86-NO-SHLD-HAVE-BMI2-NEXT:    popl %ebp
+; X86-NO-SHLD-HAVE-BMI2-NEXT:    retl
+;
+; X86-HAVE-SHLD-HAVE-BMI2-LABEL: ashr_16bytes:
+; X86-HAVE-SHLD-HAVE-BMI2:       # %bb.0:
+; X86-HAVE-SHLD-HAVE-BMI2-NEXT:    pushl %ebp
+; X86-HAVE-SHLD-HAVE-BMI2-NEXT:    pushl %ebx
+; X86-HAVE-SHLD-HAVE-BMI2-NEXT:    pushl %edi
+; X86-HAVE-SHLD-HAVE-BMI2-NEXT:    pushl %esi
+; X86-HAVE-SHLD-HAVE-BMI2-NEXT:    subl $44, %esp
+; X86-HAVE-SHLD-HAVE-BMI2-NEXT:    movl {{[0-9]+}}(%esp), %ebp
+; X86-HAVE-SHLD-HAVE-BMI2-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-HAVE-SHLD-HAVE-BMI2-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-HAVE-SHLD-HAVE-BMI2-NEXT:    movl (%edx), %esi
+; X86-HAVE-SHLD-HAVE-BMI2-NEXT:    movl 4(%edx), %edi
+; X86-HAVE-SHLD-HAVE-BMI2-NEXT:    movl 8(%edx), %ebx
+; X86-HAVE-SHLD-HAVE-BMI2-NEXT:    movl 12(%edx), %edx
+; X86-HAVE-SHLD-HAVE-BMI2-NEXT:    movzbl (%ecx), %eax
+; X86-HAVE-SHLD-HAVE-BMI2-NEXT:    movl %eax, %ecx
+; X86-HAVE-SHLD-HAVE-BMI2-NEXT:    shlb $3, %cl
+; X86-HAVE-SHLD-HAVE-BMI2-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-HAVE-BMI2-NEXT:    movl %ebx, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-HAVE-BMI2-NEXT:    movl %edi, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-HAVE-BMI2-NEXT:    movl %esi, (%esp)
+; X86-HAVE-SHLD-HAVE-BMI2-NEXT:    sarl $31, %edx
+; X86-HAVE-SHLD-HAVE-BMI2-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-HAVE-BMI2-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-HAVE-BMI2-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-HAVE-BMI2-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-HAVE-BMI2-NEXT:    andb $12, %al
+; X86-HAVE-SHLD-HAVE-BMI2-NEXT:    movzbl %al, %eax
+; X86-HAVE-SHLD-HAVE-BMI2-NEXT:    movl 8(%esp,%eax), %ebx
+; X86-HAVE-SHLD-HAVE-BMI2-NEXT:    movl (%esp,%eax), %edx
+; X86-HAVE-SHLD-HAVE-BMI2-NEXT:    movl 4(%esp,%eax), %esi
+; X86-HAVE-SHLD-HAVE-BMI2-NEXT:    movl %esi, %edi
+; X86-HAVE-SHLD-HAVE-BMI2-NEXT:    shrdl %cl, %ebx, %edi
+; X86-HAVE-SHLD-HAVE-BMI2-NEXT:    movl 12(%esp,%eax), %eax
+; X86-HAVE-SHLD-HAVE-BMI2-NEXT:    shrdl %cl, %eax, %ebx
+; X86-HAVE-SHLD-HAVE-BMI2-NEXT:    movl %ebx, 8(%ebp)
+; X86-HAVE-SHLD-HAVE-BMI2-NEXT:    sarxl %ecx, %eax, %eax
+; X86-HAVE-SHLD-HAVE-BMI2-NEXT:    movl %eax, 12(%ebp)
+; X86-HAVE-SHLD-HAVE-BMI2-NEXT:    # kill: def $cl killed $cl killed $ecx
+; X86-HAVE-SHLD-HAVE-BMI2-NEXT:    shrdl %cl, %esi, %edx
+; X86-HAVE-SHLD-HAVE-BMI2-NEXT:    movl %edx, (%ebp)
+; X86-HAVE-SHLD-HAVE-BMI2-NEXT:    movl %edi, 4(%ebp)
+; X86-HAVE-SHLD-HAVE-BMI2-NEXT:    addl $44, %esp
+; X86-HAVE-SHLD-HAVE-BMI2-NEXT:    popl %esi
+; X86-HAVE-SHLD-HAVE-BMI2-NEXT:    popl %edi
+; X86-HAVE-SHLD-HAVE-BMI2-NEXT:    popl %ebx
+; X86-HAVE-SHLD-HAVE-BMI2-NEXT:    popl %ebp
+; X86-HAVE-SHLD-HAVE-BMI2-NEXT:    retl
+  %src = load i128, ptr %src.ptr, align 1
+  %byteOff = load i128, ptr %byteOff.ptr, align 1
+  %bitOff = shl i128 %byteOff, 3
+  %res = ashr i128 %src, %bitOff
+  store i128 %res, ptr %dst, align 1
+  ret void
+}
+
+define void @ashr_16bytes_dwordOff(ptr %src.ptr, ptr %dwordOff.ptr, ptr %dst) nounwind {
+; X64-NO-SHLD-NO-BMI2-LABEL: ashr_16bytes_dwordOff:
+; X64-NO-SHLD-NO-BMI2:       # %bb.0:
+; X64-NO-SHLD-NO-BMI2-NEXT:    movq (%rdi), %r8
+; X64-NO-SHLD-NO-BMI2-NEXT:    movq 8(%rdi), %rdi
+; X64-NO-SHLD-NO-BMI2-NEXT:    movzbl (%rsi), %eax
+; X64-NO-SHLD-NO-BMI2-NEXT:    shlb $5, %al
+; X64-NO-SHLD-NO-BMI2-NEXT:    movl %eax, %ecx
+; X64-NO-SHLD-NO-BMI2-NEXT:    shrq %cl, %r8
+; X64-NO-SHLD-NO-BMI2-NEXT:    leaq (%rdi,%rdi), %rsi
+; X64-NO-SHLD-NO-BMI2-NEXT:    notb %cl
+; X64-NO-SHLD-NO-BMI2-NEXT:    shlq %cl, %rsi
+; X64-NO-SHLD-NO-BMI2-NEXT:    orq %r8, %rsi
+; X64-NO-SHLD-NO-BMI2-NEXT:    movq %rdi, %r8
+; X64-NO-SHLD-NO-BMI2-NEXT:    movl %eax, %ecx
+; X64-NO-SHLD-NO-BMI2-NEXT:    sarq %cl, %r8
+; X64-NO-SHLD-NO-BMI2-NEXT:    sarq $63, %rdi
+; X64-NO-SHLD-NO-BMI2-NEXT:    testb $64, %al
+; X64-NO-SHLD-NO-BMI2-NEXT:    cmovneq %r8, %rsi
+; X64-NO-SHLD-NO-BMI2-NEXT:    cmoveq %r8, %rdi
+; X64-NO-SHLD-NO-BMI2-NEXT:    movq %rdi, 8(%rdx)
+; X64-NO-SHLD-NO-BMI2-NEXT:    movq %rsi, (%rdx)
+; X64-NO-SHLD-NO-BMI2-NEXT:    retq
+;
+; X64-HAVE-SHLD-NO-BMI2-LABEL: ashr_16bytes_dwordOff:
+; X64-HAVE-SHLD-NO-BMI2:       # %bb.0:
+; X64-HAVE-SHLD-NO-BMI2-NEXT:    movq (%rdi), %rax
+; X64-HAVE-SHLD-NO-BMI2-NEXT:    movq 8(%rdi), %rdi
+; X64-HAVE-SHLD-NO-BMI2-NEXT:    movzbl (%rsi), %ecx
+; X64-HAVE-SHLD-NO-BMI2-NEXT:    shlb $5, %cl
+; X64-HAVE-SHLD-NO-BMI2-NEXT:    movq %rdi, %rsi
+; X64-HAVE-SHLD-NO-BMI2-NEXT:    sarq %cl, %rsi
+; X64-HAVE-SHLD-NO-BMI2-NEXT:    shrdq %cl, %rdi, %rax
+; X64-HAVE-SHLD-NO-BMI2-NEXT:    sarq $63, %rdi
+; X64-HAVE-SHLD-NO-BMI2-NEXT:    testb $64, %cl
+; X64-HAVE-SHLD-NO-BMI2-NEXT:    cmovneq %rsi, %rax
+; X64-HAVE-SHLD-NO-BMI2-NEXT:    cmoveq %rsi, %rdi
+; X64-HAVE-SHLD-NO-BMI2-NEXT:    movq %rdi, 8(%rdx)
+; X64-HAVE-SHLD-NO-BMI2-NEXT:    movq %rax, (%rdx)
+; X64-HAVE-SHLD-NO-BMI2-NEXT:    retq
+;
+; X64-NO-SHLD-HAVE-BMI2-LABEL: ashr_16bytes_dwordOff:
+; X64-NO-SHLD-HAVE-BMI2:       # %bb.0:
+; X64-NO-SHLD-HAVE-BMI2-NEXT:    movq 8(%rdi), %rax
+; X64-NO-SHLD-HAVE-BMI2-NEXT:    movzbl (%rsi), %ecx
+; X64-NO-SHLD-HAVE-BMI2-NEXT:    shlb $5, %cl
+; X64-NO-SHLD-HAVE-BMI2-NEXT:    shrxq %rcx, (%rdi), %rsi
+; X64-NO-SHLD-HAVE-BMI2-NEXT:    movl %ecx, %edi
+; X64-NO-SHLD-HAVE-BMI2-NEXT:    notb %dil
+; X64-NO-SHLD-HAVE-BMI2-NEXT:    leaq (%rax,%rax), %r8
+; X64-NO-SHLD-HAVE-BMI2-NEXT:    shlxq %rdi, %r8, %rdi
+; X64-NO-SHLD-HAVE-BMI2-NEXT:    orq %rsi, %rdi
+; X64-NO-SHLD-HAVE-BMI2-NEXT:    sarxq %rcx, %rax, %rsi
+; X64-NO-SHLD-HAVE-BMI2-NEXT:    sarq $63, %rax
+; X64-NO-SHLD-HAVE-BMI2-NEXT:    testb $64, %cl
+; X64-NO-SHLD-HAVE-BMI2-NEXT:    cmovneq %rsi, %rdi
+; X64-NO-SHLD-HAVE-BMI2-NEXT:    cmoveq %rsi, %rax
+; X64-NO-SHLD-HAVE-BMI2-NEXT:    movq %rax, 8(%rdx)
+; X64-NO-SHLD-HAVE-BMI2-NEXT:    movq %rdi, (%rdx)
+; X64-NO-SHLD-HAVE-BMI2-NEXT:    retq
+;
+; X64-HAVE-SHLD-HAVE-BMI2-LABEL: ashr_16bytes_dwordOff:
+; X64-HAVE-SHLD-HAVE-BMI2:       # %bb.0:
+; X64-HAVE-SHLD-HAVE-BMI2-NEXT:    movq (%rdi), %rax
+; X64-HAVE-SHLD-HAVE-BMI2-NEXT:    movq 8(%rdi), %rdi
+; X64-HAVE-SHLD-HAVE-BMI2-NEXT:    movzbl (%rsi), %ecx
+; X64-HAVE-SHLD-HAVE-BMI2-NEXT:    shlb $5, %cl
+; X64-HAVE-SHLD-HAVE-BMI2-NEXT:    shrdq %cl, %rdi, %rax
+; X64-HAVE-SHLD-HAVE-BMI2-NEXT:    sarxq %rcx, %rdi, %rsi
+; X64-HAVE-SHLD-HAVE-BMI2-NEXT:    sarq $63, %rdi
+; X64-HAVE-SHLD-HAVE-BMI2-NEXT:    testb $64, %cl
+; X64-HAVE-SHLD-HAVE-BMI2-NEXT:    cmovneq %rsi, %rax
+; X64-HAVE-SHLD-HAVE-BMI2-NEXT:    cmoveq %rsi, %rdi
+; X64-HAVE-SHLD-HAVE-BMI2-NEXT:    movq %rdi, 8(%rdx)
+; X64-HAVE-SHLD-HAVE-BMI2-NEXT:    movq %rax, (%rdx)
+; X64-HAVE-SHLD-HAVE-BMI2-NEXT:    retq
+;
+; X86-SSE2-LABEL: ashr_16bytes_dwordOff:
+; X86-SSE2:       # %bb.0:
+; X86-SSE2-NEXT:    pushl %ebx
+; X86-SSE2-NEXT:    pushl %edi
+; X86-SSE2-NEXT:    pushl %esi
+; X86-SSE2-NEXT:    subl $32, %esp
+; X86-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-SSE2-NEXT:    movl (%edx), %esi
+; X86-SSE2-NEXT:    movl 4(%edx), %edi
+; X86-SSE2-NEXT:    movl 8(%edx), %ebx
+; X86-SSE2-NEXT:    movl 12(%edx), %edx
+; X86-SSE2-NEXT:    movzbl (%ecx), %ecx
+; X86-SSE2-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; X86-SSE2-NEXT:    movl %ebx, {{[0-9]+}}(%esp)
+; X86-SSE2-NEXT:    movl %edi, {{[0-9]+}}(%esp)
+; X86-SSE2-NEXT:    movl %esi, (%esp)
+; X86-SSE2-NEXT:    sarl $31, %edx
+; X86-SSE2-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; X86-SSE2-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; X86-SSE2-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; X86-SSE2-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; X86-SSE2-NEXT:    andl $3, %ecx
+; X86-SSE2-NEXT:    movl (%esp,%ecx,4), %edx
+; X86-SSE2-NEXT:    movl 4(%esp,%ecx,4), %esi
+; X86-SSE2-NEXT:    movl 12(%esp,%ecx,4), %edi
+; X86-SSE2-NEXT:    movl 8(%esp,%ecx,4), %ecx
+; X86-SSE2-NEXT:    movl %ecx, 8(%eax)
+; X86-SSE2-NEXT:    movl %edi, 12(%eax)
+; X86-SSE2-NEXT:    movl %edx, (%eax)
+; X86-SSE2-NEXT:    movl %esi, 4(%eax)
+; X86-SSE2-NEXT:    addl $32, %esp
+; X86-SSE2-NEXT:    popl %esi
+; X86-SSE2-NEXT:    popl %edi
+; X86-SSE2-NEXT:    popl %ebx
+; X86-SSE2-NEXT:    retl
+;
+; X86-SSE42-LABEL: ashr_16bytes_dwordOff:
+; X86-SSE42:       # %bb.0:
+; X86-SSE42-NEXT:    pushl %ebx
+; X86-SSE42-NEXT:    pushl %edi
+; X86-SSE42-NEXT:    pushl %esi
+; X86-SSE42-NEXT:    subl $32, %esp
 ; X86-SSE42-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-SSE42-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-SSE42-NEXT:    movl {{[0-9]+}}(%esp), %edx
@@ -1021,8 +3059,8 @@ define void @ashr_16bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ; X86-SSE42-NEXT:    movl %edx, {{[0-9]+}}(%esp)
 ; X86-SSE42-NEXT:    movl %edx, {{[0-9]+}}(%esp)
 ; X86-SSE42-NEXT:    movl %edx, {{[0-9]+}}(%esp)
-; X86-SSE42-NEXT:    andl $15, %ecx
-; X86-SSE42-NEXT:    movups (%esp,%ecx), %xmm0
+; X86-SSE42-NEXT:    andl $3, %ecx
+; X86-SSE42-NEXT:    movups (%esp,%ecx,4), %xmm0
 ; X86-SSE42-NEXT:    movups %xmm0, (%eax)
 ; X86-SSE42-NEXT:    addl $32, %esp
 ; X86-SSE42-NEXT:    popl %esi
@@ -1030,7 +3068,7 @@ define void @ashr_16bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ; X86-SSE42-NEXT:    popl %ebx
 ; X86-SSE42-NEXT:    retl
 ;
-; X86-AVX-LABEL: ashr_16bytes:
+; X86-AVX-LABEL: ashr_16bytes_dwordOff:
 ; X86-AVX:       # %bb.0:
 ; X86-AVX-NEXT:    pushl %ebx
 ; X86-AVX-NEXT:    pushl %edi
@@ -1053,8 +3091,8 @@ define void @ashr_16bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ; X86-AVX-NEXT:    movl %edx, {{[0-9]+}}(%esp)
 ; X86-AVX-NEXT:    movl %edx, {{[0-9]+}}(%esp)
 ; X86-AVX-NEXT:    movl %edx, {{[0-9]+}}(%esp)
-; X86-AVX-NEXT:    andl $15, %ecx
-; X86-AVX-NEXT:    vmovups (%esp,%ecx), %xmm0
+; X86-AVX-NEXT:    andl $3, %ecx
+; X86-AVX-NEXT:    vmovups (%esp,%ecx,4), %xmm0
 ; X86-AVX-NEXT:    vmovups %xmm0, (%eax)
 ; X86-AVX-NEXT:    addl $32, %esp
 ; X86-AVX-NEXT:    popl %esi
@@ -1062,84 +3100,2562 @@ define void @ashr_16bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ; X86-AVX-NEXT:    popl %ebx
 ; X86-AVX-NEXT:    retl
   %src = load i128, ptr %src.ptr, align 1
-  %byteOff = load i128, ptr %byteOff.ptr, align 1
-  %bitOff = shl i128 %byteOff, 3
+  %dwordOff = load i128, ptr %dwordOff.ptr, align 1
+  %bitOff = shl i128 %dwordOff, 5
   %res = ashr i128 %src, %bitOff
   store i128 %res, ptr %dst, align 1
   ret void
 }
 
 define void @lshr_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
-; X64-SSE2-LABEL: lshr_32bytes:
-; X64-SSE2:       # %bb.0:
-; X64-SSE2-NEXT:    movq (%rdi), %rax
-; X64-SSE2-NEXT:    movq 8(%rdi), %rcx
-; X64-SSE2-NEXT:    movq 16(%rdi), %r8
-; X64-SSE2-NEXT:    movq 24(%rdi), %rdi
-; X64-SSE2-NEXT:    movzbl (%rsi), %esi
-; X64-SSE2-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
-; X64-SSE2-NEXT:    movq %r8, -{{[0-9]+}}(%rsp)
-; X64-SSE2-NEXT:    movq %rcx, -{{[0-9]+}}(%rsp)
-; X64-SSE2-NEXT:    movq %rax, -{{[0-9]+}}(%rsp)
-; X64-SSE2-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
-; X64-SSE2-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
-; X64-SSE2-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
-; X64-SSE2-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
-; X64-SSE2-NEXT:    andl $31, %esi
-; X64-SSE2-NEXT:    movq -64(%rsp,%rsi), %rax
-; X64-SSE2-NEXT:    movq -56(%rsp,%rsi), %rcx
-; X64-SSE2-NEXT:    movq -40(%rsp,%rsi), %rdi
-; X64-SSE2-NEXT:    movq -48(%rsp,%rsi), %rsi
-; X64-SSE2-NEXT:    movq %rsi, 16(%rdx)
-; X64-SSE2-NEXT:    movq %rdi, 24(%rdx)
-; X64-SSE2-NEXT:    movq %rax, (%rdx)
-; X64-SSE2-NEXT:    movq %rcx, 8(%rdx)
-; X64-SSE2-NEXT:    retq
+; FALLBACK0-LABEL: lshr_32bytes:
+; FALLBACK0:       # %bb.0:
+; FALLBACK0-NEXT:    pushq %rbx
+; FALLBACK0-NEXT:    movq (%rdi), %rcx
+; FALLBACK0-NEXT:    movq 8(%rdi), %r8
+; FALLBACK0-NEXT:    movq 16(%rdi), %r9
+; FALLBACK0-NEXT:    movq 24(%rdi), %rdi
+; FALLBACK0-NEXT:    movzbl (%rsi), %esi
+; FALLBACK0-NEXT:    leal (,%rsi,8), %eax
+; FALLBACK0-NEXT:    xorps %xmm0, %xmm0
+; FALLBACK0-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
+; FALLBACK0-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
+; FALLBACK0-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK0-NEXT:    movq %r9, -{{[0-9]+}}(%rsp)
+; FALLBACK0-NEXT:    movq %r8, -{{[0-9]+}}(%rsp)
+; FALLBACK0-NEXT:    movq %rcx, -{{[0-9]+}}(%rsp)
+; FALLBACK0-NEXT:    andb $24, %sil
+; FALLBACK0-NEXT:    movzbl %sil, %r9d
+; FALLBACK0-NEXT:    movq -64(%rsp,%r9), %r10
+; FALLBACK0-NEXT:    movq -56(%rsp,%r9), %rdi
+; FALLBACK0-NEXT:    movq %rdi, %r11
+; FALLBACK0-NEXT:    movl %eax, %ecx
+; FALLBACK0-NEXT:    shrq %cl, %r11
+; FALLBACK0-NEXT:    movl %eax, %esi
+; FALLBACK0-NEXT:    notb %sil
+; FALLBACK0-NEXT:    movq -48(%rsp,%r9), %rbx
+; FALLBACK0-NEXT:    leaq (%rbx,%rbx), %r8
+; FALLBACK0-NEXT:    movl %esi, %ecx
+; FALLBACK0-NEXT:    shlq %cl, %r8
+; FALLBACK0-NEXT:    orq %r11, %r8
+; FALLBACK0-NEXT:    movl %eax, %ecx
+; FALLBACK0-NEXT:    shrq %cl, %r10
+; FALLBACK0-NEXT:    addq %rdi, %rdi
+; FALLBACK0-NEXT:    movl %esi, %ecx
+; FALLBACK0-NEXT:    shlq %cl, %rdi
+; FALLBACK0-NEXT:    orq %r10, %rdi
+; FALLBACK0-NEXT:    movl %eax, %ecx
+; FALLBACK0-NEXT:    shrq %cl, %rbx
+; FALLBACK0-NEXT:    movq -40(%rsp,%r9), %r9
+; FALLBACK0-NEXT:    leaq (%r9,%r9), %r10
+; FALLBACK0-NEXT:    movl %esi, %ecx
+; FALLBACK0-NEXT:    shlq %cl, %r10
+; FALLBACK0-NEXT:    orq %rbx, %r10
+; FALLBACK0-NEXT:    movl %eax, %ecx
+; FALLBACK0-NEXT:    shrq %cl, %r9
+; FALLBACK0-NEXT:    movq %r9, 24(%rdx)
+; FALLBACK0-NEXT:    movq %r10, 16(%rdx)
+; FALLBACK0-NEXT:    movq %rdi, (%rdx)
+; FALLBACK0-NEXT:    movq %r8, 8(%rdx)
+; FALLBACK0-NEXT:    popq %rbx
+; FALLBACK0-NEXT:    retq
 ;
-; X64-SSE42-LABEL: lshr_32bytes:
-; X64-SSE42:       # %bb.0:
-; X64-SSE42-NEXT:    movups (%rdi), %xmm0
-; X64-SSE42-NEXT:    movups 16(%rdi), %xmm1
-; X64-SSE42-NEXT:    movzbl (%rsi), %eax
-; X64-SSE42-NEXT:    xorps %xmm2, %xmm2
-; X64-SSE42-NEXT:    movups %xmm2, -{{[0-9]+}}(%rsp)
-; X64-SSE42-NEXT:    movups %xmm2, -{{[0-9]+}}(%rsp)
-; X64-SSE42-NEXT:    movups %xmm1, -{{[0-9]+}}(%rsp)
-; X64-SSE42-NEXT:    movups %xmm0, -{{[0-9]+}}(%rsp)
-; X64-SSE42-NEXT:    andl $31, %eax
-; X64-SSE42-NEXT:    movups -64(%rsp,%rax), %xmm0
-; X64-SSE42-NEXT:    movups -48(%rsp,%rax), %xmm1
-; X64-SSE42-NEXT:    movups %xmm1, 16(%rdx)
-; X64-SSE42-NEXT:    movups %xmm0, (%rdx)
-; X64-SSE42-NEXT:    retq
+; FALLBACK1-LABEL: lshr_32bytes:
+; FALLBACK1:       # %bb.0:
+; FALLBACK1-NEXT:    movq (%rdi), %rax
+; FALLBACK1-NEXT:    movq 8(%rdi), %r8
+; FALLBACK1-NEXT:    movq 16(%rdi), %r9
+; FALLBACK1-NEXT:    movq 24(%rdi), %rdi
+; FALLBACK1-NEXT:    movzbl (%rsi), %esi
+; FALLBACK1-NEXT:    leal (,%rsi,8), %ecx
+; FALLBACK1-NEXT:    xorps %xmm0, %xmm0
+; FALLBACK1-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
+; FALLBACK1-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
+; FALLBACK1-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK1-NEXT:    movq %r9, -{{[0-9]+}}(%rsp)
+; FALLBACK1-NEXT:    movq %r8, -{{[0-9]+}}(%rsp)
+; FALLBACK1-NEXT:    movq %rax, -{{[0-9]+}}(%rsp)
+; FALLBACK1-NEXT:    andb $24, %sil
+; FALLBACK1-NEXT:    movzbl %sil, %eax
+; FALLBACK1-NEXT:    movq -56(%rsp,%rax), %rsi
+; FALLBACK1-NEXT:    movq -72(%rsp,%rax), %rdi
+; FALLBACK1-NEXT:    movq -64(%rsp,%rax), %r8
+; FALLBACK1-NEXT:    movq %r8, %r9
+; FALLBACK1-NEXT:    shrdq %cl, %rsi, %r9
+; FALLBACK1-NEXT:    movq -48(%rsp,%rax), %rax
+; FALLBACK1-NEXT:    shrdq %cl, %rax, %rsi
+; FALLBACK1-NEXT:    shrdq %cl, %r8, %rdi
+; FALLBACK1-NEXT:    # kill: def $cl killed $cl killed $ecx
+; FALLBACK1-NEXT:    shrq %cl, %rax
+; FALLBACK1-NEXT:    movq %rsi, 16(%rdx)
+; FALLBACK1-NEXT:    movq %rax, 24(%rdx)
+; FALLBACK1-NEXT:    movq %rdi, (%rdx)
+; FALLBACK1-NEXT:    movq %r9, 8(%rdx)
+; FALLBACK1-NEXT:    retq
 ;
-; X64-AVX-LABEL: lshr_32bytes:
-; X64-AVX:       # %bb.0:
-; X64-AVX-NEXT:    vmovups (%rdi), %ymm0
-; X64-AVX-NEXT:    movzbl (%rsi), %eax
-; X64-AVX-NEXT:    vxorps %xmm1, %xmm1, %xmm1
-; X64-AVX-NEXT:    vmovups %ymm1, -{{[0-9]+}}(%rsp)
-; X64-AVX-NEXT:    vmovups %ymm0, -{{[0-9]+}}(%rsp)
-; X64-AVX-NEXT:    andl $31, %eax
-; X64-AVX-NEXT:    vmovups -64(%rsp,%rax), %xmm0
-; X64-AVX-NEXT:    vmovups -48(%rsp,%rax), %xmm1
-; X64-AVX-NEXT:    vmovups %xmm1, 16(%rdx)
-; X64-AVX-NEXT:    vmovups %xmm0, (%rdx)
-; X64-AVX-NEXT:    vzeroupper
-; X64-AVX-NEXT:    retq
+; FALLBACK2-LABEL: lshr_32bytes:
+; FALLBACK2:       # %bb.0:
+; FALLBACK2-NEXT:    movq (%rdi), %rcx
+; FALLBACK2-NEXT:    movq 8(%rdi), %r8
+; FALLBACK2-NEXT:    movq 16(%rdi), %r9
+; FALLBACK2-NEXT:    movq 24(%rdi), %rdi
+; FALLBACK2-NEXT:    movzbl (%rsi), %esi
+; FALLBACK2-NEXT:    leal (,%rsi,8), %eax
+; FALLBACK2-NEXT:    xorps %xmm0, %xmm0
+; FALLBACK2-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
+; FALLBACK2-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
+; FALLBACK2-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK2-NEXT:    movq %r9, -{{[0-9]+}}(%rsp)
+; FALLBACK2-NEXT:    movq %r8, -{{[0-9]+}}(%rsp)
+; FALLBACK2-NEXT:    movq %rcx, -{{[0-9]+}}(%rsp)
+; FALLBACK2-NEXT:    andb $24, %sil
+; FALLBACK2-NEXT:    movzbl %sil, %ecx
+; FALLBACK2-NEXT:    movq -64(%rsp,%rcx), %rsi
+; FALLBACK2-NEXT:    movq -56(%rsp,%rcx), %rdi
+; FALLBACK2-NEXT:    shrxq %rax, %rsi, %r8
+; FALLBACK2-NEXT:    shrxq %rax, -72(%rsp,%rcx), %r9
+; FALLBACK2-NEXT:    shrxq %rax, %rdi, %r10
+; FALLBACK2-NEXT:    movq -48(%rsp,%rcx), %rcx
+; FALLBACK2-NEXT:    shrxq %rax, %rcx, %r11
+; FALLBACK2-NEXT:    # kill: def $al killed $al killed $rax def $rax
+; FALLBACK2-NEXT:    notb %al
+; FALLBACK2-NEXT:    addq %rdi, %rdi
+; FALLBACK2-NEXT:    shlxq %rax, %rdi, %rdi
+; FALLBACK2-NEXT:    orq %r8, %rdi
+; FALLBACK2-NEXT:    addq %rsi, %rsi
+; FALLBACK2-NEXT:    shlxq %rax, %rsi, %rsi
+; FALLBACK2-NEXT:    orq %r9, %rsi
+; FALLBACK2-NEXT:    addq %rcx, %rcx
+; FALLBACK2-NEXT:    shlxq %rax, %rcx, %rax
+; FALLBACK2-NEXT:    orq %r10, %rax
+; FALLBACK2-NEXT:    movq %r11, 24(%rdx)
+; FALLBACK2-NEXT:    movq %rax, 16(%rdx)
+; FALLBACK2-NEXT:    movq %rsi, (%rdx)
+; FALLBACK2-NEXT:    movq %rdi, 8(%rdx)
+; FALLBACK2-NEXT:    retq
+;
+; FALLBACK3-LABEL: lshr_32bytes:
+; FALLBACK3:       # %bb.0:
+; FALLBACK3-NEXT:    movq (%rdi), %rax
+; FALLBACK3-NEXT:    movq 8(%rdi), %r8
+; FALLBACK3-NEXT:    movq 16(%rdi), %r9
+; FALLBACK3-NEXT:    movq 24(%rdi), %rdi
+; FALLBACK3-NEXT:    movzbl (%rsi), %esi
+; FALLBACK3-NEXT:    leal (,%rsi,8), %ecx
+; FALLBACK3-NEXT:    xorps %xmm0, %xmm0
+; FALLBACK3-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
+; FALLBACK3-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
+; FALLBACK3-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK3-NEXT:    movq %r9, -{{[0-9]+}}(%rsp)
+; FALLBACK3-NEXT:    movq %r8, -{{[0-9]+}}(%rsp)
+; FALLBACK3-NEXT:    movq %rax, -{{[0-9]+}}(%rsp)
+; FALLBACK3-NEXT:    andb $24, %sil
+; FALLBACK3-NEXT:    movzbl %sil, %eax
+; FALLBACK3-NEXT:    movq -56(%rsp,%rax), %rsi
+; FALLBACK3-NEXT:    movq -72(%rsp,%rax), %rdi
+; FALLBACK3-NEXT:    movq -64(%rsp,%rax), %r8
+; FALLBACK3-NEXT:    movq %r8, %r9
+; FALLBACK3-NEXT:    shrdq %cl, %rsi, %r9
+; FALLBACK3-NEXT:    movq -48(%rsp,%rax), %rax
+; FALLBACK3-NEXT:    shrdq %cl, %rax, %rsi
+; FALLBACK3-NEXT:    shrdq %cl, %r8, %rdi
+; FALLBACK3-NEXT:    shrxq %rcx, %rax, %rax
+; FALLBACK3-NEXT:    movq %rsi, 16(%rdx)
+; FALLBACK3-NEXT:    movq %rax, 24(%rdx)
+; FALLBACK3-NEXT:    movq %rdi, (%rdx)
+; FALLBACK3-NEXT:    movq %r9, 8(%rdx)
+; FALLBACK3-NEXT:    retq
+;
+; FALLBACK4-LABEL: lshr_32bytes:
+; FALLBACK4:       # %bb.0:
+; FALLBACK4-NEXT:    pushq %rbx
+; FALLBACK4-NEXT:    movups (%rdi), %xmm0
+; FALLBACK4-NEXT:    movups 16(%rdi), %xmm1
+; FALLBACK4-NEXT:    movzbl (%rsi), %ecx
+; FALLBACK4-NEXT:    leal (,%rcx,8), %eax
+; FALLBACK4-NEXT:    xorps %xmm2, %xmm2
+; FALLBACK4-NEXT:    movaps %xmm2, -{{[0-9]+}}(%rsp)
+; FALLBACK4-NEXT:    movaps %xmm2, -{{[0-9]+}}(%rsp)
+; FALLBACK4-NEXT:    movaps %xmm1, -{{[0-9]+}}(%rsp)
+; FALLBACK4-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
+; FALLBACK4-NEXT:    andb $24, %cl
+; FALLBACK4-NEXT:    movzbl %cl, %r9d
+; FALLBACK4-NEXT:    movq -64(%rsp,%r9), %r10
+; FALLBACK4-NEXT:    movq -56(%rsp,%r9), %r8
+; FALLBACK4-NEXT:    movl %eax, %ecx
+; FALLBACK4-NEXT:    shrq %cl, %r10
+; FALLBACK4-NEXT:    movl %eax, %esi
+; FALLBACK4-NEXT:    notb %sil
+; FALLBACK4-NEXT:    leaq (%r8,%r8), %rdi
+; FALLBACK4-NEXT:    movl %esi, %ecx
+; FALLBACK4-NEXT:    shlq %cl, %rdi
+; FALLBACK4-NEXT:    orq %r10, %rdi
+; FALLBACK4-NEXT:    movq -48(%rsp,%r9), %r10
+; FALLBACK4-NEXT:    movq %r10, %r11
+; FALLBACK4-NEXT:    movl %eax, %ecx
+; FALLBACK4-NEXT:    shrq %cl, %r11
+; FALLBACK4-NEXT:    movq -40(%rsp,%r9), %r9
+; FALLBACK4-NEXT:    leaq (%r9,%r9), %rbx
+; FALLBACK4-NEXT:    movl %esi, %ecx
+; FALLBACK4-NEXT:    shlq %cl, %rbx
+; FALLBACK4-NEXT:    orq %r11, %rbx
+; FALLBACK4-NEXT:    movl %eax, %ecx
+; FALLBACK4-NEXT:    shrq %cl, %r8
+; FALLBACK4-NEXT:    addq %r10, %r10
+; FALLBACK4-NEXT:    movl %esi, %ecx
+; FALLBACK4-NEXT:    shlq %cl, %r10
+; FALLBACK4-NEXT:    orq %r8, %r10
+; FALLBACK4-NEXT:    movl %eax, %ecx
+; FALLBACK4-NEXT:    shrq %cl, %r9
+; FALLBACK4-NEXT:    movq %r9, 24(%rdx)
+; FALLBACK4-NEXT:    movq %r10, 8(%rdx)
+; FALLBACK4-NEXT:    movq %rbx, 16(%rdx)
+; FALLBACK4-NEXT:    movq %rdi, (%rdx)
+; FALLBACK4-NEXT:    popq %rbx
+; FALLBACK4-NEXT:    retq
+;
+; FALLBACK5-LABEL: lshr_32bytes:
+; FALLBACK5:       # %bb.0:
+; FALLBACK5-NEXT:    movups (%rdi), %xmm0
+; FALLBACK5-NEXT:    movups 16(%rdi), %xmm1
+; FALLBACK5-NEXT:    movzbl (%rsi), %eax
+; FALLBACK5-NEXT:    leal (,%rax,8), %ecx
+; FALLBACK5-NEXT:    xorps %xmm2, %xmm2
+; FALLBACK5-NEXT:    movaps %xmm2, -{{[0-9]+}}(%rsp)
+; FALLBACK5-NEXT:    movaps %xmm2, -{{[0-9]+}}(%rsp)
+; FALLBACK5-NEXT:    movaps %xmm1, -{{[0-9]+}}(%rsp)
+; FALLBACK5-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
+; FALLBACK5-NEXT:    andb $24, %al
+; FALLBACK5-NEXT:    movzbl %al, %eax
+; FALLBACK5-NEXT:    movq -48(%rsp,%rax), %rsi
+; FALLBACK5-NEXT:    movq -56(%rsp,%rax), %rdi
+; FALLBACK5-NEXT:    movq %rdi, %r8
+; FALLBACK5-NEXT:    shrdq %cl, %rsi, %r8
+; FALLBACK5-NEXT:    movq -72(%rsp,%rax), %r9
+; FALLBACK5-NEXT:    movq -64(%rsp,%rax), %rax
+; FALLBACK5-NEXT:    movq %rax, %r10
+; FALLBACK5-NEXT:    shrdq %cl, %rdi, %r10
+; FALLBACK5-NEXT:    shrdq %cl, %rax, %r9
+; FALLBACK5-NEXT:    # kill: def $cl killed $cl killed $ecx
+; FALLBACK5-NEXT:    shrq %cl, %rsi
+; FALLBACK5-NEXT:    movq %r10, 8(%rdx)
+; FALLBACK5-NEXT:    movq %r8, 16(%rdx)
+; FALLBACK5-NEXT:    movq %rsi, 24(%rdx)
+; FALLBACK5-NEXT:    movq %r9, (%rdx)
+; FALLBACK5-NEXT:    retq
+;
+; FALLBACK6-LABEL: lshr_32bytes:
+; FALLBACK6:       # %bb.0:
+; FALLBACK6-NEXT:    movups (%rdi), %xmm0
+; FALLBACK6-NEXT:    movups 16(%rdi), %xmm1
+; FALLBACK6-NEXT:    movzbl (%rsi), %ecx
+; FALLBACK6-NEXT:    leal (,%rcx,8), %eax
+; FALLBACK6-NEXT:    xorps %xmm2, %xmm2
+; FALLBACK6-NEXT:    movaps %xmm2, -{{[0-9]+}}(%rsp)
+; FALLBACK6-NEXT:    movaps %xmm2, -{{[0-9]+}}(%rsp)
+; FALLBACK6-NEXT:    movaps %xmm1, -{{[0-9]+}}(%rsp)
+; FALLBACK6-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
+; FALLBACK6-NEXT:    andb $24, %cl
+; FALLBACK6-NEXT:    movzbl %cl, %ecx
+; FALLBACK6-NEXT:    shrxq %rax, -72(%rsp,%rcx), %rsi
+; FALLBACK6-NEXT:    movq -64(%rsp,%rcx), %rdi
+; FALLBACK6-NEXT:    movq -56(%rsp,%rcx), %r8
+; FALLBACK6-NEXT:    shrxq %rax, %r8, %r9
+; FALLBACK6-NEXT:    movq -48(%rsp,%rcx), %rcx
+; FALLBACK6-NEXT:    shrxq %rax, %rdi, %r10
+; FALLBACK6-NEXT:    shrxq %rax, %rcx, %r11
+; FALLBACK6-NEXT:    # kill: def $al killed $al killed $rax def $rax
+; FALLBACK6-NEXT:    notb %al
+; FALLBACK6-NEXT:    addq %rdi, %rdi
+; FALLBACK6-NEXT:    shlxq %rax, %rdi, %rdi
+; FALLBACK6-NEXT:    orq %rsi, %rdi
+; FALLBACK6-NEXT:    addq %rcx, %rcx
+; FALLBACK6-NEXT:    shlxq %rax, %rcx, %rcx
+; FALLBACK6-NEXT:    orq %r9, %rcx
+; FALLBACK6-NEXT:    addq %r8, %r8
+; FALLBACK6-NEXT:    shlxq %rax, %r8, %rax
+; FALLBACK6-NEXT:    orq %r10, %rax
+; FALLBACK6-NEXT:    movq %r11, 24(%rdx)
+; FALLBACK6-NEXT:    movq %rax, 8(%rdx)
+; FALLBACK6-NEXT:    movq %rcx, 16(%rdx)
+; FALLBACK6-NEXT:    movq %rdi, (%rdx)
+; FALLBACK6-NEXT:    retq
+;
+; FALLBACK7-LABEL: lshr_32bytes:
+; FALLBACK7:       # %bb.0:
+; FALLBACK7-NEXT:    movups (%rdi), %xmm0
+; FALLBACK7-NEXT:    movups 16(%rdi), %xmm1
+; FALLBACK7-NEXT:    movzbl (%rsi), %eax
+; FALLBACK7-NEXT:    leal (,%rax,8), %ecx
+; FALLBACK7-NEXT:    xorps %xmm2, %xmm2
+; FALLBACK7-NEXT:    movaps %xmm2, -{{[0-9]+}}(%rsp)
+; FALLBACK7-NEXT:    movaps %xmm2, -{{[0-9]+}}(%rsp)
+; FALLBACK7-NEXT:    movaps %xmm1, -{{[0-9]+}}(%rsp)
+; FALLBACK7-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
+; FALLBACK7-NEXT:    andb $24, %al
+; FALLBACK7-NEXT:    movzbl %al, %eax
+; FALLBACK7-NEXT:    movq -48(%rsp,%rax), %rsi
+; FALLBACK7-NEXT:    movq -56(%rsp,%rax), %rdi
+; FALLBACK7-NEXT:    movq %rdi, %r8
+; FALLBACK7-NEXT:    shrdq %cl, %rsi, %r8
+; FALLBACK7-NEXT:    movq -72(%rsp,%rax), %r9
+; FALLBACK7-NEXT:    movq -64(%rsp,%rax), %rax
+; FALLBACK7-NEXT:    movq %rax, %r10
+; FALLBACK7-NEXT:    shrdq %cl, %rdi, %r10
+; FALLBACK7-NEXT:    shrdq %cl, %rax, %r9
+; FALLBACK7-NEXT:    shrxq %rcx, %rsi, %rax
+; FALLBACK7-NEXT:    movq %r10, 8(%rdx)
+; FALLBACK7-NEXT:    movq %r8, 16(%rdx)
+; FALLBACK7-NEXT:    movq %rax, 24(%rdx)
+; FALLBACK7-NEXT:    movq %r9, (%rdx)
+; FALLBACK7-NEXT:    retq
 ;
-; X86-SSE2-LABEL: lshr_32bytes:
+; FALLBACK8-LABEL: lshr_32bytes:
+; FALLBACK8:       # %bb.0:
+; FALLBACK8-NEXT:    pushq %rbx
+; FALLBACK8-NEXT:    vmovups (%rdi), %ymm0
+; FALLBACK8-NEXT:    movzbl (%rsi), %ecx
+; FALLBACK8-NEXT:    leal (,%rcx,8), %eax
+; FALLBACK8-NEXT:    vxorps %xmm1, %xmm1, %xmm1
+; FALLBACK8-NEXT:    vmovups %ymm1, -{{[0-9]+}}(%rsp)
+; FALLBACK8-NEXT:    vmovups %ymm0, -{{[0-9]+}}(%rsp)
+; FALLBACK8-NEXT:    andb $24, %cl
+; FALLBACK8-NEXT:    movzbl %cl, %r9d
+; FALLBACK8-NEXT:    movq -64(%rsp,%r9), %r10
+; FALLBACK8-NEXT:    movq -56(%rsp,%r9), %r8
+; FALLBACK8-NEXT:    movl %eax, %ecx
+; FALLBACK8-NEXT:    shrq %cl, %r10
+; FALLBACK8-NEXT:    movl %eax, %esi
+; FALLBACK8-NEXT:    notb %sil
+; FALLBACK8-NEXT:    leaq (%r8,%r8), %rdi
+; FALLBACK8-NEXT:    movl %esi, %ecx
+; FALLBACK8-NEXT:    shlq %cl, %rdi
+; FALLBACK8-NEXT:    orq %r10, %rdi
+; FALLBACK8-NEXT:    movq -48(%rsp,%r9), %r10
+; FALLBACK8-NEXT:    movq %r10, %r11
+; FALLBACK8-NEXT:    movl %eax, %ecx
+; FALLBACK8-NEXT:    shrq %cl, %r11
+; FALLBACK8-NEXT:    movq -40(%rsp,%r9), %r9
+; FALLBACK8-NEXT:    leaq (%r9,%r9), %rbx
+; FALLBACK8-NEXT:    movl %esi, %ecx
+; FALLBACK8-NEXT:    shlq %cl, %rbx
+; FALLBACK8-NEXT:    orq %r11, %rbx
+; FALLBACK8-NEXT:    movl %eax, %ecx
+; FALLBACK8-NEXT:    shrq %cl, %r8
+; FALLBACK8-NEXT:    addq %r10, %r10
+; FALLBACK8-NEXT:    movl %esi, %ecx
+; FALLBACK8-NEXT:    shlq %cl, %r10
+; FALLBACK8-NEXT:    orq %r8, %r10
+; FALLBACK8-NEXT:    movl %eax, %ecx
+; FALLBACK8-NEXT:    shrq %cl, %r9
+; FALLBACK8-NEXT:    movq %r9, 24(%rdx)
+; FALLBACK8-NEXT:    movq %r10, 8(%rdx)
+; FALLBACK8-NEXT:    movq %rbx, 16(%rdx)
+; FALLBACK8-NEXT:    movq %rdi, (%rdx)
+; FALLBACK8-NEXT:    popq %rbx
+; FALLBACK8-NEXT:    vzeroupper
+; FALLBACK8-NEXT:    retq
+;
+; FALLBACK9-LABEL: lshr_32bytes:
+; FALLBACK9:       # %bb.0:
+; FALLBACK9-NEXT:    vmovups (%rdi), %ymm0
+; FALLBACK9-NEXT:    movzbl (%rsi), %eax
+; FALLBACK9-NEXT:    leal (,%rax,8), %ecx
+; FALLBACK9-NEXT:    vxorps %xmm1, %xmm1, %xmm1
+; FALLBACK9-NEXT:    vmovups %ymm1, -{{[0-9]+}}(%rsp)
+; FALLBACK9-NEXT:    vmovups %ymm0, -{{[0-9]+}}(%rsp)
+; FALLBACK9-NEXT:    andb $24, %al
+; FALLBACK9-NEXT:    movzbl %al, %eax
+; FALLBACK9-NEXT:    movq -48(%rsp,%rax), %rsi
+; FALLBACK9-NEXT:    movq -56(%rsp,%rax), %rdi
+; FALLBACK9-NEXT:    movq %rdi, %r8
+; FALLBACK9-NEXT:    shrdq %cl, %rsi, %r8
+; FALLBACK9-NEXT:    movq -72(%rsp,%rax), %r9
+; FALLBACK9-NEXT:    movq -64(%rsp,%rax), %rax
+; FALLBACK9-NEXT:    movq %rax, %r10
+; FALLBACK9-NEXT:    shrdq %cl, %rdi, %r10
+; FALLBACK9-NEXT:    shrdq %cl, %rax, %r9
+; FALLBACK9-NEXT:    # kill: def $cl killed $cl killed $ecx
+; FALLBACK9-NEXT:    shrq %cl, %rsi
+; FALLBACK9-NEXT:    movq %r10, 8(%rdx)
+; FALLBACK9-NEXT:    movq %r8, 16(%rdx)
+; FALLBACK9-NEXT:    movq %rsi, 24(%rdx)
+; FALLBACK9-NEXT:    movq %r9, (%rdx)
+; FALLBACK9-NEXT:    vzeroupper
+; FALLBACK9-NEXT:    retq
+;
+; FALLBACK10-LABEL: lshr_32bytes:
+; FALLBACK10:       # %bb.0:
+; FALLBACK10-NEXT:    vmovups (%rdi), %ymm0
+; FALLBACK10-NEXT:    movzbl (%rsi), %ecx
+; FALLBACK10-NEXT:    leal (,%rcx,8), %eax
+; FALLBACK10-NEXT:    vxorps %xmm1, %xmm1, %xmm1
+; FALLBACK10-NEXT:    vmovups %ymm1, -{{[0-9]+}}(%rsp)
+; FALLBACK10-NEXT:    vmovups %ymm0, -{{[0-9]+}}(%rsp)
+; FALLBACK10-NEXT:    andb $24, %cl
+; FALLBACK10-NEXT:    movzbl %cl, %ecx
+; FALLBACK10-NEXT:    shrxq %rax, -72(%rsp,%rcx), %rsi
+; FALLBACK10-NEXT:    movq -64(%rsp,%rcx), %rdi
+; FALLBACK10-NEXT:    movq -56(%rsp,%rcx), %r8
+; FALLBACK10-NEXT:    shrxq %rax, %r8, %r9
+; FALLBACK10-NEXT:    movq -48(%rsp,%rcx), %rcx
+; FALLBACK10-NEXT:    shrxq %rax, %rdi, %r10
+; FALLBACK10-NEXT:    shrxq %rax, %rcx, %r11
+; FALLBACK10-NEXT:    # kill: def $al killed $al killed $rax def $rax
+; FALLBACK10-NEXT:    notb %al
+; FALLBACK10-NEXT:    addq %rdi, %rdi
+; FALLBACK10-NEXT:    shlxq %rax, %rdi, %rdi
+; FALLBACK10-NEXT:    orq %rsi, %rdi
+; FALLBACK10-NEXT:    addq %rcx, %rcx
+; FALLBACK10-NEXT:    shlxq %rax, %rcx, %rcx
+; FALLBACK10-NEXT:    orq %r9, %rcx
+; FALLBACK10-NEXT:    addq %r8, %r8
+; FALLBACK10-NEXT:    shlxq %rax, %r8, %rax
+; FALLBACK10-NEXT:    orq %r10, %rax
+; FALLBACK10-NEXT:    movq %r11, 24(%rdx)
+; FALLBACK10-NEXT:    movq %rax, 8(%rdx)
+; FALLBACK10-NEXT:    movq %rcx, 16(%rdx)
+; FALLBACK10-NEXT:    movq %rdi, (%rdx)
+; FALLBACK10-NEXT:    vzeroupper
+; FALLBACK10-NEXT:    retq
+;
+; FALLBACK11-LABEL: lshr_32bytes:
+; FALLBACK11:       # %bb.0:
+; FALLBACK11-NEXT:    vmovups (%rdi), %ymm0
+; FALLBACK11-NEXT:    movzbl (%rsi), %eax
+; FALLBACK11-NEXT:    leal (,%rax,8), %ecx
+; FALLBACK11-NEXT:    vxorps %xmm1, %xmm1, %xmm1
+; FALLBACK11-NEXT:    vmovups %ymm1, -{{[0-9]+}}(%rsp)
+; FALLBACK11-NEXT:    vmovups %ymm0, -{{[0-9]+}}(%rsp)
+; FALLBACK11-NEXT:    andb $24, %al
+; FALLBACK11-NEXT:    movzbl %al, %eax
+; FALLBACK11-NEXT:    movq -48(%rsp,%rax), %rsi
+; FALLBACK11-NEXT:    movq -56(%rsp,%rax), %rdi
+; FALLBACK11-NEXT:    movq %rdi, %r8
+; FALLBACK11-NEXT:    shrdq %cl, %rsi, %r8
+; FALLBACK11-NEXT:    movq -72(%rsp,%rax), %r9
+; FALLBACK11-NEXT:    movq -64(%rsp,%rax), %rax
+; FALLBACK11-NEXT:    movq %rax, %r10
+; FALLBACK11-NEXT:    shrdq %cl, %rdi, %r10
+; FALLBACK11-NEXT:    shrdq %cl, %rax, %r9
+; FALLBACK11-NEXT:    shrxq %rcx, %rsi, %rax
+; FALLBACK11-NEXT:    movq %r10, 8(%rdx)
+; FALLBACK11-NEXT:    movq %r8, 16(%rdx)
+; FALLBACK11-NEXT:    movq %rax, 24(%rdx)
+; FALLBACK11-NEXT:    movq %r9, (%rdx)
+; FALLBACK11-NEXT:    vzeroupper
+; FALLBACK11-NEXT:    retq
+;
+; FALLBACK12-LABEL: lshr_32bytes:
+; FALLBACK12:       # %bb.0:
+; FALLBACK12-NEXT:    pushq %rbx
+; FALLBACK12-NEXT:    vmovups (%rdi), %ymm0
+; FALLBACK12-NEXT:    movzbl (%rsi), %ecx
+; FALLBACK12-NEXT:    leal (,%rcx,8), %eax
+; FALLBACK12-NEXT:    vxorps %xmm1, %xmm1, %xmm1
+; FALLBACK12-NEXT:    vmovups %ymm1, -{{[0-9]+}}(%rsp)
+; FALLBACK12-NEXT:    vmovups %ymm0, -{{[0-9]+}}(%rsp)
+; FALLBACK12-NEXT:    andb $24, %cl
+; FALLBACK12-NEXT:    movzbl %cl, %r9d
+; FALLBACK12-NEXT:    movq -64(%rsp,%r9), %r10
+; FALLBACK12-NEXT:    movq -56(%rsp,%r9), %r8
+; FALLBACK12-NEXT:    movl %eax, %ecx
+; FALLBACK12-NEXT:    shrq %cl, %r10
+; FALLBACK12-NEXT:    movl %eax, %esi
+; FALLBACK12-NEXT:    notb %sil
+; FALLBACK12-NEXT:    leaq (%r8,%r8), %rdi
+; FALLBACK12-NEXT:    movl %esi, %ecx
+; FALLBACK12-NEXT:    shlq %cl, %rdi
+; FALLBACK12-NEXT:    orq %r10, %rdi
+; FALLBACK12-NEXT:    movq -48(%rsp,%r9), %r10
+; FALLBACK12-NEXT:    movq %r10, %r11
+; FALLBACK12-NEXT:    movl %eax, %ecx
+; FALLBACK12-NEXT:    shrq %cl, %r11
+; FALLBACK12-NEXT:    movq -40(%rsp,%r9), %r9
+; FALLBACK12-NEXT:    leaq (%r9,%r9), %rbx
+; FALLBACK12-NEXT:    movl %esi, %ecx
+; FALLBACK12-NEXT:    shlq %cl, %rbx
+; FALLBACK12-NEXT:    orq %r11, %rbx
+; FALLBACK12-NEXT:    movl %eax, %ecx
+; FALLBACK12-NEXT:    shrq %cl, %r8
+; FALLBACK12-NEXT:    addq %r10, %r10
+; FALLBACK12-NEXT:    movl %esi, %ecx
+; FALLBACK12-NEXT:    shlq %cl, %r10
+; FALLBACK12-NEXT:    orq %r8, %r10
+; FALLBACK12-NEXT:    movl %eax, %ecx
+; FALLBACK12-NEXT:    shrq %cl, %r9
+; FALLBACK12-NEXT:    movq %r9, 24(%rdx)
+; FALLBACK12-NEXT:    movq %r10, 8(%rdx)
+; FALLBACK12-NEXT:    movq %rbx, 16(%rdx)
+; FALLBACK12-NEXT:    movq %rdi, (%rdx)
+; FALLBACK12-NEXT:    popq %rbx
+; FALLBACK12-NEXT:    vzeroupper
+; FALLBACK12-NEXT:    retq
+;
+; FALLBACK13-LABEL: lshr_32bytes:
+; FALLBACK13:       # %bb.0:
+; FALLBACK13-NEXT:    vmovups (%rdi), %ymm0
+; FALLBACK13-NEXT:    movzbl (%rsi), %eax
+; FALLBACK13-NEXT:    leal (,%rax,8), %ecx
+; FALLBACK13-NEXT:    vxorps %xmm1, %xmm1, %xmm1
+; FALLBACK13-NEXT:    vmovups %ymm1, -{{[0-9]+}}(%rsp)
+; FALLBACK13-NEXT:    vmovups %ymm0, -{{[0-9]+}}(%rsp)
+; FALLBACK13-NEXT:    andb $24, %al
+; FALLBACK13-NEXT:    movzbl %al, %eax
+; FALLBACK13-NEXT:    movq -48(%rsp,%rax), %rsi
+; FALLBACK13-NEXT:    movq -56(%rsp,%rax), %rdi
+; FALLBACK13-NEXT:    movq %rdi, %r8
+; FALLBACK13-NEXT:    shrdq %cl, %rsi, %r8
+; FALLBACK13-NEXT:    movq -72(%rsp,%rax), %r9
+; FALLBACK13-NEXT:    movq -64(%rsp,%rax), %rax
+; FALLBACK13-NEXT:    movq %rax, %r10
+; FALLBACK13-NEXT:    shrdq %cl, %rdi, %r10
+; FALLBACK13-NEXT:    shrdq %cl, %rax, %r9
+; FALLBACK13-NEXT:    # kill: def $cl killed $cl killed $ecx
+; FALLBACK13-NEXT:    shrq %cl, %rsi
+; FALLBACK13-NEXT:    movq %r10, 8(%rdx)
+; FALLBACK13-NEXT:    movq %r8, 16(%rdx)
+; FALLBACK13-NEXT:    movq %rsi, 24(%rdx)
+; FALLBACK13-NEXT:    movq %r9, (%rdx)
+; FALLBACK13-NEXT:    vzeroupper
+; FALLBACK13-NEXT:    retq
+;
+; FALLBACK14-LABEL: lshr_32bytes:
+; FALLBACK14:       # %bb.0:
+; FALLBACK14-NEXT:    vmovups (%rdi), %ymm0
+; FALLBACK14-NEXT:    movzbl (%rsi), %ecx
+; FALLBACK14-NEXT:    leal (,%rcx,8), %eax
+; FALLBACK14-NEXT:    vxorps %xmm1, %xmm1, %xmm1
+; FALLBACK14-NEXT:    vmovups %ymm1, -{{[0-9]+}}(%rsp)
+; FALLBACK14-NEXT:    vmovups %ymm0, -{{[0-9]+}}(%rsp)
+; FALLBACK14-NEXT:    andb $24, %cl
+; FALLBACK14-NEXT:    movzbl %cl, %ecx
+; FALLBACK14-NEXT:    shrxq %rax, -72(%rsp,%rcx), %rsi
+; FALLBACK14-NEXT:    movq -64(%rsp,%rcx), %rdi
+; FALLBACK14-NEXT:    movq -56(%rsp,%rcx), %r8
+; FALLBACK14-NEXT:    shrxq %rax, %r8, %r9
+; FALLBACK14-NEXT:    movq -48(%rsp,%rcx), %rcx
+; FALLBACK14-NEXT:    shrxq %rax, %rdi, %r10
+; FALLBACK14-NEXT:    shrxq %rax, %rcx, %r11
+; FALLBACK14-NEXT:    # kill: def $al killed $al killed $rax def $rax
+; FALLBACK14-NEXT:    notb %al
+; FALLBACK14-NEXT:    addq %rdi, %rdi
+; FALLBACK14-NEXT:    shlxq %rax, %rdi, %rdi
+; FALLBACK14-NEXT:    orq %rsi, %rdi
+; FALLBACK14-NEXT:    addq %rcx, %rcx
+; FALLBACK14-NEXT:    shlxq %rax, %rcx, %rcx
+; FALLBACK14-NEXT:    orq %r9, %rcx
+; FALLBACK14-NEXT:    addq %r8, %r8
+; FALLBACK14-NEXT:    shlxq %rax, %r8, %rax
+; FALLBACK14-NEXT:    orq %r10, %rax
+; FALLBACK14-NEXT:    movq %r11, 24(%rdx)
+; FALLBACK14-NEXT:    movq %rax, 8(%rdx)
+; FALLBACK14-NEXT:    movq %rcx, 16(%rdx)
+; FALLBACK14-NEXT:    movq %rdi, (%rdx)
+; FALLBACK14-NEXT:    vzeroupper
+; FALLBACK14-NEXT:    retq
+;
+; FALLBACK15-LABEL: lshr_32bytes:
+; FALLBACK15:       # %bb.0:
+; FALLBACK15-NEXT:    vmovups (%rdi), %ymm0
+; FALLBACK15-NEXT:    movzbl (%rsi), %eax
+; FALLBACK15-NEXT:    leal (,%rax,8), %ecx
+; FALLBACK15-NEXT:    vxorps %xmm1, %xmm1, %xmm1
+; FALLBACK15-NEXT:    vmovups %ymm1, -{{[0-9]+}}(%rsp)
+; FALLBACK15-NEXT:    vmovups %ymm0, -{{[0-9]+}}(%rsp)
+; FALLBACK15-NEXT:    andb $24, %al
+; FALLBACK15-NEXT:    movzbl %al, %eax
+; FALLBACK15-NEXT:    movq -48(%rsp,%rax), %rsi
+; FALLBACK15-NEXT:    movq -56(%rsp,%rax), %rdi
+; FALLBACK15-NEXT:    movq %rdi, %r8
+; FALLBACK15-NEXT:    shrdq %cl, %rsi, %r8
+; FALLBACK15-NEXT:    movq -72(%rsp,%rax), %r9
+; FALLBACK15-NEXT:    movq -64(%rsp,%rax), %rax
+; FALLBACK15-NEXT:    movq %rax, %r10
+; FALLBACK15-NEXT:    shrdq %cl, %rdi, %r10
+; FALLBACK15-NEXT:    shrdq %cl, %rax, %r9
+; FALLBACK15-NEXT:    shrxq %rcx, %rsi, %rax
+; FALLBACK15-NEXT:    movq %r10, 8(%rdx)
+; FALLBACK15-NEXT:    movq %r8, 16(%rdx)
+; FALLBACK15-NEXT:    movq %rax, 24(%rdx)
+; FALLBACK15-NEXT:    movq %r9, (%rdx)
+; FALLBACK15-NEXT:    vzeroupper
+; FALLBACK15-NEXT:    retq
+;
+; FALLBACK16-LABEL: lshr_32bytes:
+; FALLBACK16:       # %bb.0:
+; FALLBACK16-NEXT:    pushl %ebp
+; FALLBACK16-NEXT:    pushl %ebx
+; FALLBACK16-NEXT:    pushl %edi
+; FALLBACK16-NEXT:    pushl %esi
+; FALLBACK16-NEXT:    subl $108, %esp
+; FALLBACK16-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; FALLBACK16-NEXT:    movl {{[0-9]+}}(%esp), %ebp
+; FALLBACK16-NEXT:    movl (%ebp), %ecx
+; FALLBACK16-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK16-NEXT:    movl 4(%ebp), %ecx
+; FALLBACK16-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK16-NEXT:    movl 8(%ebp), %ecx
+; FALLBACK16-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK16-NEXT:    movl 12(%ebp), %edi
+; FALLBACK16-NEXT:    movl 16(%ebp), %ebx
+; FALLBACK16-NEXT:    movb (%eax), %ah
+; FALLBACK16-NEXT:    movl 20(%ebp), %esi
+; FALLBACK16-NEXT:    movl 24(%ebp), %ecx
+; FALLBACK16-NEXT:    movl 28(%ebp), %ebp
+; FALLBACK16-NEXT:    xorps %xmm0, %xmm0
+; FALLBACK16-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
+; FALLBACK16-NEXT:    movl %ebp, {{[0-9]+}}(%esp)
+; FALLBACK16-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; FALLBACK16-NEXT:    movb %ah, %dh
+; FALLBACK16-NEXT:    shlb $3, %dh
+; FALLBACK16-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
+; FALLBACK16-NEXT:    movl %esi, {{[0-9]+}}(%esp)
+; FALLBACK16-NEXT:    movl %ebx, {{[0-9]+}}(%esp)
+; FALLBACK16-NEXT:    movl %edi, {{[0-9]+}}(%esp)
+; FALLBACK16-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK16-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; FALLBACK16-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK16-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; FALLBACK16-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK16-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; FALLBACK16-NEXT:    andb $28, %ah
+; FALLBACK16-NEXT:    movzbl %ah, %edi
+; FALLBACK16-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK16-NEXT:    movl 32(%esp,%edi), %esi
+; FALLBACK16-NEXT:    movl 36(%esp,%edi), %eax
+; FALLBACK16-NEXT:    movl %eax, %ebx
+; FALLBACK16-NEXT:    movb %dh, %cl
+; FALLBACK16-NEXT:    shrl %cl, %ebx
+; FALLBACK16-NEXT:    movb %dh, %dl
+; FALLBACK16-NEXT:    notb %dl
+; FALLBACK16-NEXT:    movl 40(%esp,%edi), %edi
+; FALLBACK16-NEXT:    leal (%edi,%edi), %ebp
+; FALLBACK16-NEXT:    movl %edx, %ecx
+; FALLBACK16-NEXT:    shll %cl, %ebp
+; FALLBACK16-NEXT:    orl %ebx, %ebp
+; FALLBACK16-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK16-NEXT:    movb %dh, %cl
+; FALLBACK16-NEXT:    shrl %cl, %esi
+; FALLBACK16-NEXT:    movl %eax, %ebx
+; FALLBACK16-NEXT:    addl %eax, %ebx
+; FALLBACK16-NEXT:    movl %edx, %ecx
+; FALLBACK16-NEXT:    shll %cl, %ebx
+; FALLBACK16-NEXT:    orl %esi, %ebx
+; FALLBACK16-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK16-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK16-NEXT:    movl 44(%esp,%eax), %ebp
+; FALLBACK16-NEXT:    movl %ebp, %esi
+; FALLBACK16-NEXT:    movb %dh, %cl
+; FALLBACK16-NEXT:    shrl %cl, %esi
+; FALLBACK16-NEXT:    movl 48(%esp,%eax), %eax
+; FALLBACK16-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK16-NEXT:    leal (%eax,%eax), %ebx
+; FALLBACK16-NEXT:    movl %edx, %ecx
+; FALLBACK16-NEXT:    shll %cl, %ebx
+; FALLBACK16-NEXT:    orl %esi, %ebx
+; FALLBACK16-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK16-NEXT:    movb %dh, %cl
+; FALLBACK16-NEXT:    shrl %cl, %edi
+; FALLBACK16-NEXT:    addl %ebp, %ebp
+; FALLBACK16-NEXT:    movl %edx, %ecx
+; FALLBACK16-NEXT:    shll %cl, %ebp
+; FALLBACK16-NEXT:    orl %edi, %ebp
+; FALLBACK16-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK16-NEXT:    movl 52(%esp,%eax), %edi
+; FALLBACK16-NEXT:    movl %edi, %ebx
+; FALLBACK16-NEXT:    movb %dh, %cl
+; FALLBACK16-NEXT:    shrl %cl, %ebx
+; FALLBACK16-NEXT:    movl 56(%esp,%eax), %esi
+; FALLBACK16-NEXT:    leal (%esi,%esi), %eax
+; FALLBACK16-NEXT:    movl %edx, %ecx
+; FALLBACK16-NEXT:    shll %cl, %eax
+; FALLBACK16-NEXT:    orl %ebx, %eax
+; FALLBACK16-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK16-NEXT:    movb %dh, %cl
+; FALLBACK16-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; FALLBACK16-NEXT:    shrl %cl, %ebx
+; FALLBACK16-NEXT:    addl %edi, %edi
+; FALLBACK16-NEXT:    movl %edx, %ecx
+; FALLBACK16-NEXT:    shll %cl, %edi
+; FALLBACK16-NEXT:    orl %ebx, %edi
+; FALLBACK16-NEXT:    movb %dh, %cl
+; FALLBACK16-NEXT:    movl %esi, %eax
+; FALLBACK16-NEXT:    shrl %cl, %eax
+; FALLBACK16-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK16-NEXT:    movl 60(%esp,%ecx), %ebx
+; FALLBACK16-NEXT:    leal (%ebx,%ebx), %esi
+; FALLBACK16-NEXT:    movl %edx, %ecx
+; FALLBACK16-NEXT:    shll %cl, %esi
+; FALLBACK16-NEXT:    orl %eax, %esi
+; FALLBACK16-NEXT:    movb %dh, %cl
+; FALLBACK16-NEXT:    shrl %cl, %ebx
+; FALLBACK16-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; FALLBACK16-NEXT:    movl %ebx, 28(%eax)
+; FALLBACK16-NEXT:    movl %esi, 24(%eax)
+; FALLBACK16-NEXT:    movl %edi, 16(%eax)
+; FALLBACK16-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK16-NEXT:    movl %ecx, 20(%eax)
+; FALLBACK16-NEXT:    movl %ebp, 8(%eax)
+; FALLBACK16-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK16-NEXT:    movl %ecx, 12(%eax)
+; FALLBACK16-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK16-NEXT:    movl %ecx, (%eax)
+; FALLBACK16-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK16-NEXT:    movl %ecx, 4(%eax)
+; FALLBACK16-NEXT:    addl $108, %esp
+; FALLBACK16-NEXT:    popl %esi
+; FALLBACK16-NEXT:    popl %edi
+; FALLBACK16-NEXT:    popl %ebx
+; FALLBACK16-NEXT:    popl %ebp
+; FALLBACK16-NEXT:    retl
+;
+; FALLBACK17-LABEL: lshr_32bytes:
+; FALLBACK17:       # %bb.0:
+; FALLBACK17-NEXT:    pushl %ebp
+; FALLBACK17-NEXT:    pushl %ebx
+; FALLBACK17-NEXT:    pushl %edi
+; FALLBACK17-NEXT:    pushl %esi
+; FALLBACK17-NEXT:    subl $92, %esp
+; FALLBACK17-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; FALLBACK17-NEXT:    movl {{[0-9]+}}(%esp), %ebp
+; FALLBACK17-NEXT:    movl (%ebp), %eax
+; FALLBACK17-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK17-NEXT:    movl 4(%ebp), %eax
+; FALLBACK17-NEXT:    movl %eax, (%esp) # 4-byte Spill
+; FALLBACK17-NEXT:    movl 8(%ebp), %esi
+; FALLBACK17-NEXT:    movl 12(%ebp), %edi
+; FALLBACK17-NEXT:    movl 16(%ebp), %ebx
+; FALLBACK17-NEXT:    movb (%ecx), %ch
+; FALLBACK17-NEXT:    movl 20(%ebp), %edx
+; FALLBACK17-NEXT:    movl 24(%ebp), %eax
+; FALLBACK17-NEXT:    movl 28(%ebp), %ebp
+; FALLBACK17-NEXT:    xorps %xmm0, %xmm0
+; FALLBACK17-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
+; FALLBACK17-NEXT:    movl %ebp, {{[0-9]+}}(%esp)
+; FALLBACK17-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK17-NEXT:    movb %ch, %cl
+; FALLBACK17-NEXT:    shlb $3, %cl
+; FALLBACK17-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
+; FALLBACK17-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; FALLBACK17-NEXT:    movl %ebx, {{[0-9]+}}(%esp)
+; FALLBACK17-NEXT:    movl %edi, {{[0-9]+}}(%esp)
+; FALLBACK17-NEXT:    movl %esi, {{[0-9]+}}(%esp)
+; FALLBACK17-NEXT:    movl (%esp), %eax # 4-byte Reload
+; FALLBACK17-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK17-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK17-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK17-NEXT:    andb $28, %ch
+; FALLBACK17-NEXT:    movzbl %ch, %ebp
+; FALLBACK17-NEXT:    movl 24(%esp,%ebp), %edx
+; FALLBACK17-NEXT:    movl 20(%esp,%ebp), %eax
+; FALLBACK17-NEXT:    movl %eax, (%esp) # 4-byte Spill
+; FALLBACK17-NEXT:    shrdl %cl, %edx, %eax
+; FALLBACK17-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK17-NEXT:    movl 32(%esp,%ebp), %ebx
+; FALLBACK17-NEXT:    movl 28(%esp,%ebp), %eax
+; FALLBACK17-NEXT:    movl %eax, %esi
+; FALLBACK17-NEXT:    shrdl %cl, %ebx, %esi
+; FALLBACK17-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK17-NEXT:    shrdl %cl, %eax, %edx
+; FALLBACK17-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK17-NEXT:    movl 40(%esp,%ebp), %edx
+; FALLBACK17-NEXT:    movl 36(%esp,%ebp), %eax
+; FALLBACK17-NEXT:    movl %eax, %edi
+; FALLBACK17-NEXT:    shrdl %cl, %edx, %edi
+; FALLBACK17-NEXT:    shrdl %cl, %eax, %ebx
+; FALLBACK17-NEXT:    movl 16(%esp,%ebp), %esi
+; FALLBACK17-NEXT:    movl 44(%esp,%ebp), %eax
+; FALLBACK17-NEXT:    shrdl %cl, %eax, %edx
+; FALLBACK17-NEXT:    movl {{[0-9]+}}(%esp), %ebp
+; FALLBACK17-NEXT:    movl %edx, 24(%ebp)
+; FALLBACK17-NEXT:    movl (%esp), %edx # 4-byte Reload
+; FALLBACK17-NEXT:    shrdl %cl, %edx, %esi
+; FALLBACK17-NEXT:    shrl %cl, %eax
+; FALLBACK17-NEXT:    movl %eax, 28(%ebp)
+; FALLBACK17-NEXT:    movl %ebx, 16(%ebp)
+; FALLBACK17-NEXT:    movl %edi, 20(%ebp)
+; FALLBACK17-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK17-NEXT:    movl %eax, 8(%ebp)
+; FALLBACK17-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK17-NEXT:    movl %eax, 12(%ebp)
+; FALLBACK17-NEXT:    movl %esi, (%ebp)
+; FALLBACK17-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK17-NEXT:    movl %eax, 4(%ebp)
+; FALLBACK17-NEXT:    addl $92, %esp
+; FALLBACK17-NEXT:    popl %esi
+; FALLBACK17-NEXT:    popl %edi
+; FALLBACK17-NEXT:    popl %ebx
+; FALLBACK17-NEXT:    popl %ebp
+; FALLBACK17-NEXT:    retl
+;
+; FALLBACK18-LABEL: lshr_32bytes:
+; FALLBACK18:       # %bb.0:
+; FALLBACK18-NEXT:    pushl %ebp
+; FALLBACK18-NEXT:    pushl %ebx
+; FALLBACK18-NEXT:    pushl %edi
+; FALLBACK18-NEXT:    pushl %esi
+; FALLBACK18-NEXT:    subl $108, %esp
+; FALLBACK18-NEXT:    movl {{[0-9]+}}(%esp), %ebx
+; FALLBACK18-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; FALLBACK18-NEXT:    movl (%eax), %ecx
+; FALLBACK18-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK18-NEXT:    movl 4(%eax), %ecx
+; FALLBACK18-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK18-NEXT:    movl 8(%eax), %esi
+; FALLBACK18-NEXT:    movl 12(%eax), %edi
+; FALLBACK18-NEXT:    movl 16(%eax), %ebp
+; FALLBACK18-NEXT:    movzbl (%ebx), %ebx
+; FALLBACK18-NEXT:    movl 20(%eax), %edx
+; FALLBACK18-NEXT:    movl 24(%eax), %ecx
+; FALLBACK18-NEXT:    movl 28(%eax), %eax
+; FALLBACK18-NEXT:    xorps %xmm0, %xmm0
+; FALLBACK18-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
+; FALLBACK18-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK18-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; FALLBACK18-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; FALLBACK18-NEXT:    movl %ebx, %eax
+; FALLBACK18-NEXT:    shlb $3, %al
+; FALLBACK18-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
+; FALLBACK18-NEXT:    movl %ebp, {{[0-9]+}}(%esp)
+; FALLBACK18-NEXT:    movl %edi, {{[0-9]+}}(%esp)
+; FALLBACK18-NEXT:    movl %esi, {{[0-9]+}}(%esp)
+; FALLBACK18-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK18-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; FALLBACK18-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK18-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; FALLBACK18-NEXT:    andb $28, %bl
+; FALLBACK18-NEXT:    movzbl %bl, %edi
+; FALLBACK18-NEXT:    movl 36(%esp,%edi), %esi
+; FALLBACK18-NEXT:    movl 40(%esp,%edi), %ecx
+; FALLBACK18-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK18-NEXT:    shrxl %eax, %esi, %edx
+; FALLBACK18-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK18-NEXT:    movl %eax, %edx
+; FALLBACK18-NEXT:    movl %eax, %ebx
+; FALLBACK18-NEXT:    notb %dl
+; FALLBACK18-NEXT:    leal (%ecx,%ecx), %ebp
+; FALLBACK18-NEXT:    shlxl %edx, %ebp, %eax
+; FALLBACK18-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; FALLBACK18-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK18-NEXT:    movl %ebx, %ecx
+; FALLBACK18-NEXT:    shrxl %ebx, 32(%esp,%edi), %ebx
+; FALLBACK18-NEXT:    addl %esi, %esi
+; FALLBACK18-NEXT:    shlxl %edx, %esi, %eax
+; FALLBACK18-NEXT:    orl %ebx, %eax
+; FALLBACK18-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK18-NEXT:    movl 48(%esp,%edi), %eax
+; FALLBACK18-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK18-NEXT:    leal (%eax,%eax), %ebx
+; FALLBACK18-NEXT:    shlxl %edx, %ebx, %esi
+; FALLBACK18-NEXT:    movl 44(%esp,%edi), %ebp
+; FALLBACK18-NEXT:    movl %ecx, %eax
+; FALLBACK18-NEXT:    shrxl %ecx, %ebp, %ebx
+; FALLBACK18-NEXT:    orl %ebx, %esi
+; FALLBACK18-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK18-NEXT:    shrxl %ecx, {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
+; FALLBACK18-NEXT:    movl %eax, %ebx
+; FALLBACK18-NEXT:    addl %ebp, %ebp
+; FALLBACK18-NEXT:    shlxl %edx, %ebp, %eax
+; FALLBACK18-NEXT:    orl %ecx, %eax
+; FALLBACK18-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK18-NEXT:    movl 56(%esp,%edi), %ebp
+; FALLBACK18-NEXT:    leal (%ebp,%ebp), %ecx
+; FALLBACK18-NEXT:    shlxl %edx, %ecx, %ecx
+; FALLBACK18-NEXT:    movl 52(%esp,%edi), %eax
+; FALLBACK18-NEXT:    shrxl %ebx, %eax, %esi
+; FALLBACK18-NEXT:    orl %esi, %ecx
+; FALLBACK18-NEXT:    shrxl %ebx, {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
+; FALLBACK18-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK18-NEXT:    addl %eax, %eax
+; FALLBACK18-NEXT:    shlxl %edx, %eax, %esi
+; FALLBACK18-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
+; FALLBACK18-NEXT:    shrxl %ebx, %ebp, %eax
+; FALLBACK18-NEXT:    movl 60(%esp,%edi), %edi
+; FALLBACK18-NEXT:    shrxl %ebx, %edi, %ebx
+; FALLBACK18-NEXT:    addl %edi, %edi
+; FALLBACK18-NEXT:    shlxl %edx, %edi, %edi
+; FALLBACK18-NEXT:    orl %eax, %edi
+; FALLBACK18-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; FALLBACK18-NEXT:    movl %ebx, 28(%eax)
+; FALLBACK18-NEXT:    movl %edi, 24(%eax)
+; FALLBACK18-NEXT:    movl %esi, 16(%eax)
+; FALLBACK18-NEXT:    movl %ecx, 20(%eax)
+; FALLBACK18-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK18-NEXT:    movl %ecx, 8(%eax)
+; FALLBACK18-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK18-NEXT:    movl %ecx, 12(%eax)
+; FALLBACK18-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK18-NEXT:    movl %ecx, (%eax)
+; FALLBACK18-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK18-NEXT:    movl %ecx, 4(%eax)
+; FALLBACK18-NEXT:    addl $108, %esp
+; FALLBACK18-NEXT:    popl %esi
+; FALLBACK18-NEXT:    popl %edi
+; FALLBACK18-NEXT:    popl %ebx
+; FALLBACK18-NEXT:    popl %ebp
+; FALLBACK18-NEXT:    retl
+;
+; FALLBACK19-LABEL: lshr_32bytes:
+; FALLBACK19:       # %bb.0:
+; FALLBACK19-NEXT:    pushl %ebp
+; FALLBACK19-NEXT:    pushl %ebx
+; FALLBACK19-NEXT:    pushl %edi
+; FALLBACK19-NEXT:    pushl %esi
+; FALLBACK19-NEXT:    subl $92, %esp
+; FALLBACK19-NEXT:    movl {{[0-9]+}}(%esp), %ebx
+; FALLBACK19-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; FALLBACK19-NEXT:    movl (%ecx), %eax
+; FALLBACK19-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK19-NEXT:    movl 4(%ecx), %eax
+; FALLBACK19-NEXT:    movl %eax, (%esp) # 4-byte Spill
+; FALLBACK19-NEXT:    movl 8(%ecx), %esi
+; FALLBACK19-NEXT:    movl 12(%ecx), %edi
+; FALLBACK19-NEXT:    movl 16(%ecx), %ebp
+; FALLBACK19-NEXT:    movzbl (%ebx), %ebx
+; FALLBACK19-NEXT:    movl 20(%ecx), %edx
+; FALLBACK19-NEXT:    movl 24(%ecx), %eax
+; FALLBACK19-NEXT:    movl 28(%ecx), %ecx
+; FALLBACK19-NEXT:    xorps %xmm0, %xmm0
+; FALLBACK19-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
+; FALLBACK19-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; FALLBACK19-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK19-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; FALLBACK19-NEXT:    movl %ebx, %ecx
+; FALLBACK19-NEXT:    shlb $3, %cl
+; FALLBACK19-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
+; FALLBACK19-NEXT:    movl %ebp, {{[0-9]+}}(%esp)
+; FALLBACK19-NEXT:    movl %edi, {{[0-9]+}}(%esp)
+; FALLBACK19-NEXT:    movl %esi, {{[0-9]+}}(%esp)
+; FALLBACK19-NEXT:    movl (%esp), %eax # 4-byte Reload
+; FALLBACK19-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK19-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK19-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK19-NEXT:    andb $28, %bl
+; FALLBACK19-NEXT:    movzbl %bl, %ebp
+; FALLBACK19-NEXT:    movl 24(%esp,%ebp), %esi
+; FALLBACK19-NEXT:    movl 20(%esp,%ebp), %eax
+; FALLBACK19-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK19-NEXT:    shrdl %cl, %esi, %eax
+; FALLBACK19-NEXT:    movl %eax, (%esp) # 4-byte Spill
+; FALLBACK19-NEXT:    movl 32(%esp,%ebp), %ebx
+; FALLBACK19-NEXT:    movl 28(%esp,%ebp), %eax
+; FALLBACK19-NEXT:    movl %eax, %edx
+; FALLBACK19-NEXT:    shrdl %cl, %ebx, %edx
+; FALLBACK19-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK19-NEXT:    shrdl %cl, %eax, %esi
+; FALLBACK19-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK19-NEXT:    movl 40(%esp,%ebp), %eax
+; FALLBACK19-NEXT:    movl 36(%esp,%ebp), %edx
+; FALLBACK19-NEXT:    movl %edx, %esi
+; FALLBACK19-NEXT:    shrdl %cl, %eax, %esi
+; FALLBACK19-NEXT:    shrdl %cl, %edx, %ebx
+; FALLBACK19-NEXT:    movl 16(%esp,%ebp), %edx
+; FALLBACK19-NEXT:    movl 44(%esp,%ebp), %edi
+; FALLBACK19-NEXT:    shrdl %cl, %edi, %eax
+; FALLBACK19-NEXT:    movl {{[0-9]+}}(%esp), %ebp
+; FALLBACK19-NEXT:    movl %eax, 24(%ebp)
+; FALLBACK19-NEXT:    shrxl %ecx, %edi, %eax
+; FALLBACK19-NEXT:    movl %eax, 28(%ebp)
+; FALLBACK19-NEXT:    movl %ebx, 16(%ebp)
+; FALLBACK19-NEXT:    movl %esi, 20(%ebp)
+; FALLBACK19-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK19-NEXT:    movl %eax, 8(%ebp)
+; FALLBACK19-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK19-NEXT:    movl %eax, 12(%ebp)
+; FALLBACK19-NEXT:    # kill: def $cl killed $cl killed $ecx
+; FALLBACK19-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK19-NEXT:    shrdl %cl, %eax, %edx
+; FALLBACK19-NEXT:    movl %edx, (%ebp)
+; FALLBACK19-NEXT:    movl (%esp), %eax # 4-byte Reload
+; FALLBACK19-NEXT:    movl %eax, 4(%ebp)
+; FALLBACK19-NEXT:    addl $92, %esp
+; FALLBACK19-NEXT:    popl %esi
+; FALLBACK19-NEXT:    popl %edi
+; FALLBACK19-NEXT:    popl %ebx
+; FALLBACK19-NEXT:    popl %ebp
+; FALLBACK19-NEXT:    retl
+;
+; FALLBACK20-LABEL: lshr_32bytes:
+; FALLBACK20:       # %bb.0:
+; FALLBACK20-NEXT:    pushl %ebp
+; FALLBACK20-NEXT:    pushl %ebx
+; FALLBACK20-NEXT:    pushl %edi
+; FALLBACK20-NEXT:    pushl %esi
+; FALLBACK20-NEXT:    subl $108, %esp
+; FALLBACK20-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; FALLBACK20-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; FALLBACK20-NEXT:    movups (%ecx), %xmm0
+; FALLBACK20-NEXT:    movups 16(%ecx), %xmm1
+; FALLBACK20-NEXT:    movzbl (%eax), %ecx
+; FALLBACK20-NEXT:    movl %ecx, %eax
+; FALLBACK20-NEXT:    shlb $3, %al
+; FALLBACK20-NEXT:    xorps %xmm2, %xmm2
+; FALLBACK20-NEXT:    movaps %xmm2, {{[0-9]+}}(%esp)
+; FALLBACK20-NEXT:    movaps %xmm2, {{[0-9]+}}(%esp)
+; FALLBACK20-NEXT:    movaps %xmm1, {{[0-9]+}}(%esp)
+; FALLBACK20-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
+; FALLBACK20-NEXT:    andb $28, %cl
+; FALLBACK20-NEXT:    movzbl %cl, %ecx
+; FALLBACK20-NEXT:    movl 32(%esp,%ecx), %esi
+; FALLBACK20-NEXT:    movl 36(%esp,%ecx), %ebx
+; FALLBACK20-NEXT:    movl %ecx, %edi
+; FALLBACK20-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK20-NEXT:    movl %eax, %ecx
+; FALLBACK20-NEXT:    shrl %cl, %esi
+; FALLBACK20-NEXT:    movl %eax, %edx
+; FALLBACK20-NEXT:    notb %dl
+; FALLBACK20-NEXT:    addl %ebx, %ebx
+; FALLBACK20-NEXT:    movl %edx, %ecx
+; FALLBACK20-NEXT:    shll %cl, %ebx
+; FALLBACK20-NEXT:    orl %esi, %ebx
+; FALLBACK20-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK20-NEXT:    movl 44(%esp,%edi), %ebp
+; FALLBACK20-NEXT:    movl %ebp, %esi
+; FALLBACK20-NEXT:    movl %eax, %ecx
+; FALLBACK20-NEXT:    shrl %cl, %esi
+; FALLBACK20-NEXT:    movl 48(%esp,%edi), %ecx
+; FALLBACK20-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK20-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK20-NEXT:    leal (%ecx,%ecx), %ebx
+; FALLBACK20-NEXT:    movl %edx, %ecx
+; FALLBACK20-NEXT:    shll %cl, %ebx
+; FALLBACK20-NEXT:    orl %esi, %ebx
+; FALLBACK20-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK20-NEXT:    movl 40(%esp,%edi), %esi
+; FALLBACK20-NEXT:    movl %esi, %ebx
+; FALLBACK20-NEXT:    movl %eax, %ecx
+; FALLBACK20-NEXT:    shrl %cl, %ebx
+; FALLBACK20-NEXT:    addl %ebp, %ebp
+; FALLBACK20-NEXT:    movl %edx, %ecx
+; FALLBACK20-NEXT:    shll %cl, %ebp
+; FALLBACK20-NEXT:    orl %ebx, %ebp
+; FALLBACK20-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK20-NEXT:    movl 52(%esp,%edi), %ebp
+; FALLBACK20-NEXT:    movl %ebp, %ebx
+; FALLBACK20-NEXT:    movl %eax, %ecx
+; FALLBACK20-NEXT:    shrl %cl, %ebx
+; FALLBACK20-NEXT:    movl 56(%esp,%edi), %ecx
+; FALLBACK20-NEXT:    movl %ecx, (%esp) # 4-byte Spill
+; FALLBACK20-NEXT:    leal (%ecx,%ecx), %edi
+; FALLBACK20-NEXT:    movl %edx, %ecx
+; FALLBACK20-NEXT:    shll %cl, %edi
+; FALLBACK20-NEXT:    orl %ebx, %edi
+; FALLBACK20-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK20-NEXT:    movl %eax, %ecx
+; FALLBACK20-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; FALLBACK20-NEXT:    shrl %cl, %edi
+; FALLBACK20-NEXT:    addl %ebp, %ebp
+; FALLBACK20-NEXT:    movl %edx, %ecx
+; FALLBACK20-NEXT:    shll %cl, %ebp
+; FALLBACK20-NEXT:    orl %edi, %ebp
+; FALLBACK20-NEXT:    movl %eax, %ecx
+; FALLBACK20-NEXT:    shrl %cl, (%esp) # 4-byte Folded Spill
+; FALLBACK20-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK20-NEXT:    movl 60(%esp,%ecx), %ebx
+; FALLBACK20-NEXT:    leal (%ebx,%ebx), %edi
+; FALLBACK20-NEXT:    movl %edx, %ecx
+; FALLBACK20-NEXT:    shll %cl, %edi
+; FALLBACK20-NEXT:    orl (%esp), %edi # 4-byte Folded Reload
+; FALLBACK20-NEXT:    movl %eax, %ecx
+; FALLBACK20-NEXT:    shrl %cl, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; FALLBACK20-NEXT:    addl %esi, %esi
+; FALLBACK20-NEXT:    movl %edx, %ecx
+; FALLBACK20-NEXT:    shll %cl, %esi
+; FALLBACK20-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
+; FALLBACK20-NEXT:    movl %eax, %ecx
+; FALLBACK20-NEXT:    shrl %cl, %ebx
+; FALLBACK20-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; FALLBACK20-NEXT:    movl %ebx, 28(%eax)
+; FALLBACK20-NEXT:    movl %esi, 4(%eax)
+; FALLBACK20-NEXT:    movl %edi, 24(%eax)
+; FALLBACK20-NEXT:    movl %ebp, 16(%eax)
+; FALLBACK20-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK20-NEXT:    movl %ecx, 20(%eax)
+; FALLBACK20-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK20-NEXT:    movl %ecx, 8(%eax)
+; FALLBACK20-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK20-NEXT:    movl %ecx, 12(%eax)
+; FALLBACK20-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK20-NEXT:    movl %ecx, (%eax)
+; FALLBACK20-NEXT:    addl $108, %esp
+; FALLBACK20-NEXT:    popl %esi
+; FALLBACK20-NEXT:    popl %edi
+; FALLBACK20-NEXT:    popl %ebx
+; FALLBACK20-NEXT:    popl %ebp
+; FALLBACK20-NEXT:    retl
+;
+; FALLBACK21-LABEL: lshr_32bytes:
+; FALLBACK21:       # %bb.0:
+; FALLBACK21-NEXT:    pushl %ebp
+; FALLBACK21-NEXT:    pushl %ebx
+; FALLBACK21-NEXT:    pushl %edi
+; FALLBACK21-NEXT:    pushl %esi
+; FALLBACK21-NEXT:    subl $108, %esp
+; FALLBACK21-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; FALLBACK21-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; FALLBACK21-NEXT:    movups (%ecx), %xmm0
+; FALLBACK21-NEXT:    movups 16(%ecx), %xmm1
+; FALLBACK21-NEXT:    movzbl (%eax), %eax
+; FALLBACK21-NEXT:    movl %eax, %ecx
+; FALLBACK21-NEXT:    shlb $3, %cl
+; FALLBACK21-NEXT:    xorps %xmm2, %xmm2
+; FALLBACK21-NEXT:    movaps %xmm2, {{[0-9]+}}(%esp)
+; FALLBACK21-NEXT:    movaps %xmm2, {{[0-9]+}}(%esp)
+; FALLBACK21-NEXT:    movaps %xmm1, {{[0-9]+}}(%esp)
+; FALLBACK21-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
+; FALLBACK21-NEXT:    andb $28, %al
+; FALLBACK21-NEXT:    movzbl %al, %ebp
+; FALLBACK21-NEXT:    movl 48(%esp,%ebp), %esi
+; FALLBACK21-NEXT:    movl 44(%esp,%ebp), %eax
+; FALLBACK21-NEXT:    movl %eax, %edx
+; FALLBACK21-NEXT:    shrdl %cl, %esi, %edx
+; FALLBACK21-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK21-NEXT:    movl 40(%esp,%ebp), %edx
+; FALLBACK21-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK21-NEXT:    shrdl %cl, %eax, %edx
+; FALLBACK21-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK21-NEXT:    movl 56(%esp,%ebp), %ebx
+; FALLBACK21-NEXT:    movl 52(%esp,%ebp), %eax
+; FALLBACK21-NEXT:    movl %eax, %edx
+; FALLBACK21-NEXT:    shrdl %cl, %ebx, %edx
+; FALLBACK21-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK21-NEXT:    shrdl %cl, %eax, %esi
+; FALLBACK21-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK21-NEXT:    movl 60(%esp,%ebp), %eax
+; FALLBACK21-NEXT:    shrdl %cl, %eax, %ebx
+; FALLBACK21-NEXT:    movl 32(%esp,%ebp), %edx
+; FALLBACK21-NEXT:    movl 36(%esp,%ebp), %edi
+; FALLBACK21-NEXT:    movl %edi, %esi
+; FALLBACK21-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
+; FALLBACK21-NEXT:    shrdl %cl, %ebp, %esi
+; FALLBACK21-NEXT:    movl {{[0-9]+}}(%esp), %ebp
+; FALLBACK21-NEXT:    movl %esi, 4(%ebp)
+; FALLBACK21-NEXT:    movl %ebx, 24(%ebp)
+; FALLBACK21-NEXT:    shrdl %cl, %edi, %edx
+; FALLBACK21-NEXT:    shrl %cl, %eax
+; FALLBACK21-NEXT:    movl %eax, 28(%ebp)
+; FALLBACK21-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK21-NEXT:    movl %eax, 16(%ebp)
+; FALLBACK21-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK21-NEXT:    movl %eax, 20(%ebp)
+; FALLBACK21-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK21-NEXT:    movl %eax, 8(%ebp)
+; FALLBACK21-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK21-NEXT:    movl %eax, 12(%ebp)
+; FALLBACK21-NEXT:    movl %edx, (%ebp)
+; FALLBACK21-NEXT:    addl $108, %esp
+; FALLBACK21-NEXT:    popl %esi
+; FALLBACK21-NEXT:    popl %edi
+; FALLBACK21-NEXT:    popl %ebx
+; FALLBACK21-NEXT:    popl %ebp
+; FALLBACK21-NEXT:    retl
+;
+; FALLBACK22-LABEL: lshr_32bytes:
+; FALLBACK22:       # %bb.0:
+; FALLBACK22-NEXT:    pushl %ebp
+; FALLBACK22-NEXT:    pushl %ebx
+; FALLBACK22-NEXT:    pushl %edi
+; FALLBACK22-NEXT:    pushl %esi
+; FALLBACK22-NEXT:    subl $108, %esp
+; FALLBACK22-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; FALLBACK22-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; FALLBACK22-NEXT:    movups (%ecx), %xmm0
+; FALLBACK22-NEXT:    movups 16(%ecx), %xmm1
+; FALLBACK22-NEXT:    movzbl (%eax), %ecx
+; FALLBACK22-NEXT:    movl %ecx, %edx
+; FALLBACK22-NEXT:    shlb $3, %dl
+; FALLBACK22-NEXT:    xorps %xmm2, %xmm2
+; FALLBACK22-NEXT:    movaps %xmm2, {{[0-9]+}}(%esp)
+; FALLBACK22-NEXT:    movaps %xmm2, {{[0-9]+}}(%esp)
+; FALLBACK22-NEXT:    movaps %xmm1, {{[0-9]+}}(%esp)
+; FALLBACK22-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
+; FALLBACK22-NEXT:    andb $28, %cl
+; FALLBACK22-NEXT:    movzbl %cl, %edi
+; FALLBACK22-NEXT:    shrxl %edx, 32(%esp,%edi), %ecx
+; FALLBACK22-NEXT:    movl %edx, %eax
+; FALLBACK22-NEXT:    notb %al
+; FALLBACK22-NEXT:    movl 36(%esp,%edi), %esi
+; FALLBACK22-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK22-NEXT:    addl %esi, %esi
+; FALLBACK22-NEXT:    shlxl %eax, %esi, %esi
+; FALLBACK22-NEXT:    orl %ecx, %esi
+; FALLBACK22-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK22-NEXT:    movl 48(%esp,%edi), %ecx
+; FALLBACK22-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK22-NEXT:    addl %ecx, %ecx
+; FALLBACK22-NEXT:    shlxl %eax, %ecx, %esi
+; FALLBACK22-NEXT:    movl %eax, %ebp
+; FALLBACK22-NEXT:    movl 44(%esp,%edi), %ecx
+; FALLBACK22-NEXT:    shrxl %edx, %ecx, %ebx
+; FALLBACK22-NEXT:    orl %ebx, %esi
+; FALLBACK22-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK22-NEXT:    addl %ecx, %ecx
+; FALLBACK22-NEXT:    shlxl %eax, %ecx, %esi
+; FALLBACK22-NEXT:    movl 40(%esp,%edi), %eax
+; FALLBACK22-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK22-NEXT:    shrxl %edx, %eax, %ebx
+; FALLBACK22-NEXT:    orl %ebx, %esi
+; FALLBACK22-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK22-NEXT:    movl 56(%esp,%edi), %esi
+; FALLBACK22-NEXT:    leal (%esi,%esi), %ebx
+; FALLBACK22-NEXT:    shlxl %ebp, %ebx, %eax
+; FALLBACK22-NEXT:    movl %ebp, %ecx
+; FALLBACK22-NEXT:    movl 52(%esp,%edi), %ebx
+; FALLBACK22-NEXT:    shrxl %edx, %ebx, %ebp
+; FALLBACK22-NEXT:    orl %ebp, %eax
+; FALLBACK22-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK22-NEXT:    shrxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
+; FALLBACK22-NEXT:    addl %ebx, %ebx
+; FALLBACK22-NEXT:    shlxl %ecx, %ebx, %ebx
+; FALLBACK22-NEXT:    orl %ebp, %ebx
+; FALLBACK22-NEXT:    shrxl %edx, %esi, %ebp
+; FALLBACK22-NEXT:    shrxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
+; FALLBACK22-NEXT:    movl 60(%esp,%edi), %edi
+; FALLBACK22-NEXT:    shrxl %edx, %edi, %eax
+; FALLBACK22-NEXT:    addl %edi, %edi
+; FALLBACK22-NEXT:    movl %ecx, %edx
+; FALLBACK22-NEXT:    shlxl %ecx, %edi, %edi
+; FALLBACK22-NEXT:    orl %ebp, %edi
+; FALLBACK22-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK22-NEXT:    addl %ecx, %ecx
+; FALLBACK22-NEXT:    shlxl %edx, %ecx, %ecx
+; FALLBACK22-NEXT:    orl %esi, %ecx
+; FALLBACK22-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; FALLBACK22-NEXT:    movl %eax, 28(%edx)
+; FALLBACK22-NEXT:    movl %ecx, 4(%edx)
+; FALLBACK22-NEXT:    movl %edi, 24(%edx)
+; FALLBACK22-NEXT:    movl %ebx, 16(%edx)
+; FALLBACK22-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK22-NEXT:    movl %eax, 20(%edx)
+; FALLBACK22-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK22-NEXT:    movl %eax, 8(%edx)
+; FALLBACK22-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK22-NEXT:    movl %eax, 12(%edx)
+; FALLBACK22-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK22-NEXT:    movl %eax, (%edx)
+; FALLBACK22-NEXT:    addl $108, %esp
+; FALLBACK22-NEXT:    popl %esi
+; FALLBACK22-NEXT:    popl %edi
+; FALLBACK22-NEXT:    popl %ebx
+; FALLBACK22-NEXT:    popl %ebp
+; FALLBACK22-NEXT:    retl
+;
+; FALLBACK23-LABEL: lshr_32bytes:
+; FALLBACK23:       # %bb.0:
+; FALLBACK23-NEXT:    pushl %ebp
+; FALLBACK23-NEXT:    pushl %ebx
+; FALLBACK23-NEXT:    pushl %edi
+; FALLBACK23-NEXT:    pushl %esi
+; FALLBACK23-NEXT:    subl $108, %esp
+; FALLBACK23-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; FALLBACK23-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; FALLBACK23-NEXT:    movups (%ecx), %xmm0
+; FALLBACK23-NEXT:    movups 16(%ecx), %xmm1
+; FALLBACK23-NEXT:    movzbl (%eax), %eax
+; FALLBACK23-NEXT:    movl %eax, %ecx
+; FALLBACK23-NEXT:    shlb $3, %cl
+; FALLBACK23-NEXT:    xorps %xmm2, %xmm2
+; FALLBACK23-NEXT:    movaps %xmm2, {{[0-9]+}}(%esp)
+; FALLBACK23-NEXT:    movaps %xmm2, {{[0-9]+}}(%esp)
+; FALLBACK23-NEXT:    movaps %xmm1, {{[0-9]+}}(%esp)
+; FALLBACK23-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
+; FALLBACK23-NEXT:    andb $28, %al
+; FALLBACK23-NEXT:    movzbl %al, %ebx
+; FALLBACK23-NEXT:    movl 48(%esp,%ebx), %esi
+; FALLBACK23-NEXT:    movl 44(%esp,%ebx), %eax
+; FALLBACK23-NEXT:    movl %eax, %edx
+; FALLBACK23-NEXT:    shrdl %cl, %esi, %edx
+; FALLBACK23-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK23-NEXT:    movl 40(%esp,%ebx), %edx
+; FALLBACK23-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK23-NEXT:    shrdl %cl, %eax, %edx
+; FALLBACK23-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK23-NEXT:    movl 56(%esp,%ebx), %ebp
+; FALLBACK23-NEXT:    movl 52(%esp,%ebx), %eax
+; FALLBACK23-NEXT:    movl %eax, %edi
+; FALLBACK23-NEXT:    shrdl %cl, %ebp, %edi
+; FALLBACK23-NEXT:    shrdl %cl, %eax, %esi
+; FALLBACK23-NEXT:    movl 60(%esp,%ebx), %eax
+; FALLBACK23-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK23-NEXT:    shrdl %cl, %eax, %ebp
+; FALLBACK23-NEXT:    movl 32(%esp,%ebx), %edx
+; FALLBACK23-NEXT:    movl 36(%esp,%ebx), %ebx
+; FALLBACK23-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK23-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK23-NEXT:    shrdl %cl, %eax, %ebx
+; FALLBACK23-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; FALLBACK23-NEXT:    movl %ebx, 4(%eax)
+; FALLBACK23-NEXT:    movl %ebp, 24(%eax)
+; FALLBACK23-NEXT:    shrxl %ecx, {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
+; FALLBACK23-NEXT:    movl %ebx, 28(%eax)
+; FALLBACK23-NEXT:    movl %esi, 16(%eax)
+; FALLBACK23-NEXT:    movl %edi, 20(%eax)
+; FALLBACK23-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; FALLBACK23-NEXT:    movl %esi, 8(%eax)
+; FALLBACK23-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; FALLBACK23-NEXT:    movl %esi, 12(%eax)
+; FALLBACK23-NEXT:    # kill: def $cl killed $cl killed $ecx
+; FALLBACK23-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; FALLBACK23-NEXT:    shrdl %cl, %esi, %edx
+; FALLBACK23-NEXT:    movl %edx, (%eax)
+; FALLBACK23-NEXT:    addl $108, %esp
+; FALLBACK23-NEXT:    popl %esi
+; FALLBACK23-NEXT:    popl %edi
+; FALLBACK23-NEXT:    popl %ebx
+; FALLBACK23-NEXT:    popl %ebp
+; FALLBACK23-NEXT:    retl
+;
+; FALLBACK24-LABEL: lshr_32bytes:
+; FALLBACK24:       # %bb.0:
+; FALLBACK24-NEXT:    pushl %ebp
+; FALLBACK24-NEXT:    pushl %ebx
+; FALLBACK24-NEXT:    pushl %edi
+; FALLBACK24-NEXT:    pushl %esi
+; FALLBACK24-NEXT:    subl $108, %esp
+; FALLBACK24-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; FALLBACK24-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; FALLBACK24-NEXT:    vmovups (%ecx), %ymm0
+; FALLBACK24-NEXT:    movzbl (%eax), %ecx
+; FALLBACK24-NEXT:    movl %ecx, %eax
+; FALLBACK24-NEXT:    shlb $3, %al
+; FALLBACK24-NEXT:    vxorps %xmm1, %xmm1, %xmm1
+; FALLBACK24-NEXT:    vmovups %ymm1, {{[0-9]+}}(%esp)
+; FALLBACK24-NEXT:    vmovups %ymm0, {{[0-9]+}}(%esp)
+; FALLBACK24-NEXT:    andb $28, %cl
+; FALLBACK24-NEXT:    movzbl %cl, %ecx
+; FALLBACK24-NEXT:    movl 32(%esp,%ecx), %esi
+; FALLBACK24-NEXT:    movl 36(%esp,%ecx), %ebx
+; FALLBACK24-NEXT:    movl %ecx, %edi
+; FALLBACK24-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK24-NEXT:    movl %eax, %ecx
+; FALLBACK24-NEXT:    shrl %cl, %esi
+; FALLBACK24-NEXT:    movl %eax, %edx
+; FALLBACK24-NEXT:    notb %dl
+; FALLBACK24-NEXT:    addl %ebx, %ebx
+; FALLBACK24-NEXT:    movl %edx, %ecx
+; FALLBACK24-NEXT:    shll %cl, %ebx
+; FALLBACK24-NEXT:    orl %esi, %ebx
+; FALLBACK24-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK24-NEXT:    movl 44(%esp,%edi), %ebp
+; FALLBACK24-NEXT:    movl %ebp, %esi
+; FALLBACK24-NEXT:    movl %eax, %ecx
+; FALLBACK24-NEXT:    shrl %cl, %esi
+; FALLBACK24-NEXT:    movl 48(%esp,%edi), %ecx
+; FALLBACK24-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK24-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK24-NEXT:    leal (%ecx,%ecx), %ebx
+; FALLBACK24-NEXT:    movl %edx, %ecx
+; FALLBACK24-NEXT:    shll %cl, %ebx
+; FALLBACK24-NEXT:    orl %esi, %ebx
+; FALLBACK24-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK24-NEXT:    movl 40(%esp,%edi), %esi
+; FALLBACK24-NEXT:    movl %esi, %ebx
+; FALLBACK24-NEXT:    movl %eax, %ecx
+; FALLBACK24-NEXT:    shrl %cl, %ebx
+; FALLBACK24-NEXT:    addl %ebp, %ebp
+; FALLBACK24-NEXT:    movl %edx, %ecx
+; FALLBACK24-NEXT:    shll %cl, %ebp
+; FALLBACK24-NEXT:    orl %ebx, %ebp
+; FALLBACK24-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK24-NEXT:    movl 52(%esp,%edi), %ebp
+; FALLBACK24-NEXT:    movl %ebp, %ebx
+; FALLBACK24-NEXT:    movl %eax, %ecx
+; FALLBACK24-NEXT:    shrl %cl, %ebx
+; FALLBACK24-NEXT:    movl 56(%esp,%edi), %ecx
+; FALLBACK24-NEXT:    movl %ecx, (%esp) # 4-byte Spill
+; FALLBACK24-NEXT:    leal (%ecx,%ecx), %edi
+; FALLBACK24-NEXT:    movl %edx, %ecx
+; FALLBACK24-NEXT:    shll %cl, %edi
+; FALLBACK24-NEXT:    orl %ebx, %edi
+; FALLBACK24-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK24-NEXT:    movl %eax, %ecx
+; FALLBACK24-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; FALLBACK24-NEXT:    shrl %cl, %edi
+; FALLBACK24-NEXT:    addl %ebp, %ebp
+; FALLBACK24-NEXT:    movl %edx, %ecx
+; FALLBACK24-NEXT:    shll %cl, %ebp
+; FALLBACK24-NEXT:    orl %edi, %ebp
+; FALLBACK24-NEXT:    movl %eax, %ecx
+; FALLBACK24-NEXT:    shrl %cl, (%esp) # 4-byte Folded Spill
+; FALLBACK24-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK24-NEXT:    movl 60(%esp,%ecx), %ebx
+; FALLBACK24-NEXT:    leal (%ebx,%ebx), %edi
+; FALLBACK24-NEXT:    movl %edx, %ecx
+; FALLBACK24-NEXT:    shll %cl, %edi
+; FALLBACK24-NEXT:    orl (%esp), %edi # 4-byte Folded Reload
+; FALLBACK24-NEXT:    movl %eax, %ecx
+; FALLBACK24-NEXT:    shrl %cl, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; FALLBACK24-NEXT:    addl %esi, %esi
+; FALLBACK24-NEXT:    movl %edx, %ecx
+; FALLBACK24-NEXT:    shll %cl, %esi
+; FALLBACK24-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
+; FALLBACK24-NEXT:    movl %eax, %ecx
+; FALLBACK24-NEXT:    shrl %cl, %ebx
+; FALLBACK24-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; FALLBACK24-NEXT:    movl %ebx, 28(%eax)
+; FALLBACK24-NEXT:    movl %esi, 4(%eax)
+; FALLBACK24-NEXT:    movl %edi, 24(%eax)
+; FALLBACK24-NEXT:    movl %ebp, 16(%eax)
+; FALLBACK24-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK24-NEXT:    movl %ecx, 20(%eax)
+; FALLBACK24-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK24-NEXT:    movl %ecx, 8(%eax)
+; FALLBACK24-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK24-NEXT:    movl %ecx, 12(%eax)
+; FALLBACK24-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK24-NEXT:    movl %ecx, (%eax)
+; FALLBACK24-NEXT:    addl $108, %esp
+; FALLBACK24-NEXT:    popl %esi
+; FALLBACK24-NEXT:    popl %edi
+; FALLBACK24-NEXT:    popl %ebx
+; FALLBACK24-NEXT:    popl %ebp
+; FALLBACK24-NEXT:    vzeroupper
+; FALLBACK24-NEXT:    retl
+;
+; FALLBACK25-LABEL: lshr_32bytes:
+; FALLBACK25:       # %bb.0:
+; FALLBACK25-NEXT:    pushl %ebp
+; FALLBACK25-NEXT:    pushl %ebx
+; FALLBACK25-NEXT:    pushl %edi
+; FALLBACK25-NEXT:    pushl %esi
+; FALLBACK25-NEXT:    subl $108, %esp
+; FALLBACK25-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; FALLBACK25-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; FALLBACK25-NEXT:    vmovups (%ecx), %ymm0
+; FALLBACK25-NEXT:    movzbl (%eax), %eax
+; FALLBACK25-NEXT:    movl %eax, %ecx
+; FALLBACK25-NEXT:    shlb $3, %cl
+; FALLBACK25-NEXT:    vxorps %xmm1, %xmm1, %xmm1
+; FALLBACK25-NEXT:    vmovups %ymm1, {{[0-9]+}}(%esp)
+; FALLBACK25-NEXT:    vmovups %ymm0, {{[0-9]+}}(%esp)
+; FALLBACK25-NEXT:    andb $28, %al
+; FALLBACK25-NEXT:    movzbl %al, %ebp
+; FALLBACK25-NEXT:    movl 48(%esp,%ebp), %esi
+; FALLBACK25-NEXT:    movl 44(%esp,%ebp), %eax
+; FALLBACK25-NEXT:    movl %eax, %edx
+; FALLBACK25-NEXT:    shrdl %cl, %esi, %edx
+; FALLBACK25-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK25-NEXT:    movl 40(%esp,%ebp), %edx
+; FALLBACK25-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK25-NEXT:    shrdl %cl, %eax, %edx
+; FALLBACK25-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK25-NEXT:    movl 56(%esp,%ebp), %ebx
+; FALLBACK25-NEXT:    movl 52(%esp,%ebp), %eax
+; FALLBACK25-NEXT:    movl %eax, %edx
+; FALLBACK25-NEXT:    shrdl %cl, %ebx, %edx
+; FALLBACK25-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK25-NEXT:    shrdl %cl, %eax, %esi
+; FALLBACK25-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK25-NEXT:    movl 60(%esp,%ebp), %eax
+; FALLBACK25-NEXT:    shrdl %cl, %eax, %ebx
+; FALLBACK25-NEXT:    movl 32(%esp,%ebp), %edx
+; FALLBACK25-NEXT:    movl 36(%esp,%ebp), %edi
+; FALLBACK25-NEXT:    movl %edi, %esi
+; FALLBACK25-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
+; FALLBACK25-NEXT:    shrdl %cl, %ebp, %esi
+; FALLBACK25-NEXT:    movl {{[0-9]+}}(%esp), %ebp
+; FALLBACK25-NEXT:    movl %esi, 4(%ebp)
+; FALLBACK25-NEXT:    movl %ebx, 24(%ebp)
+; FALLBACK25-NEXT:    shrdl %cl, %edi, %edx
+; FALLBACK25-NEXT:    shrl %cl, %eax
+; FALLBACK25-NEXT:    movl %eax, 28(%ebp)
+; FALLBACK25-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK25-NEXT:    movl %eax, 16(%ebp)
+; FALLBACK25-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK25-NEXT:    movl %eax, 20(%ebp)
+; FALLBACK25-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK25-NEXT:    movl %eax, 8(%ebp)
+; FALLBACK25-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK25-NEXT:    movl %eax, 12(%ebp)
+; FALLBACK25-NEXT:    movl %edx, (%ebp)
+; FALLBACK25-NEXT:    addl $108, %esp
+; FALLBACK25-NEXT:    popl %esi
+; FALLBACK25-NEXT:    popl %edi
+; FALLBACK25-NEXT:    popl %ebx
+; FALLBACK25-NEXT:    popl %ebp
+; FALLBACK25-NEXT:    vzeroupper
+; FALLBACK25-NEXT:    retl
+;
+; FALLBACK26-LABEL: lshr_32bytes:
+; FALLBACK26:       # %bb.0:
+; FALLBACK26-NEXT:    pushl %ebp
+; FALLBACK26-NEXT:    pushl %ebx
+; FALLBACK26-NEXT:    pushl %edi
+; FALLBACK26-NEXT:    pushl %esi
+; FALLBACK26-NEXT:    subl $108, %esp
+; FALLBACK26-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; FALLBACK26-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; FALLBACK26-NEXT:    vmovups (%ecx), %ymm0
+; FALLBACK26-NEXT:    movzbl (%eax), %ecx
+; FALLBACK26-NEXT:    movl %ecx, %edx
+; FALLBACK26-NEXT:    shlb $3, %dl
+; FALLBACK26-NEXT:    vxorps %xmm1, %xmm1, %xmm1
+; FALLBACK26-NEXT:    vmovups %ymm1, {{[0-9]+}}(%esp)
+; FALLBACK26-NEXT:    vmovups %ymm0, {{[0-9]+}}(%esp)
+; FALLBACK26-NEXT:    andb $28, %cl
+; FALLBACK26-NEXT:    movzbl %cl, %edi
+; FALLBACK26-NEXT:    shrxl %edx, 32(%esp,%edi), %ecx
+; FALLBACK26-NEXT:    movl %edx, %eax
+; FALLBACK26-NEXT:    notb %al
+; FALLBACK26-NEXT:    movl 36(%esp,%edi), %esi
+; FALLBACK26-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK26-NEXT:    addl %esi, %esi
+; FALLBACK26-NEXT:    shlxl %eax, %esi, %esi
+; FALLBACK26-NEXT:    orl %ecx, %esi
+; FALLBACK26-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK26-NEXT:    movl 48(%esp,%edi), %ecx
+; FALLBACK26-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK26-NEXT:    addl %ecx, %ecx
+; FALLBACK26-NEXT:    shlxl %eax, %ecx, %esi
+; FALLBACK26-NEXT:    movl %eax, %ebp
+; FALLBACK26-NEXT:    movl 44(%esp,%edi), %ecx
+; FALLBACK26-NEXT:    shrxl %edx, %ecx, %ebx
+; FALLBACK26-NEXT:    orl %ebx, %esi
+; FALLBACK26-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK26-NEXT:    addl %ecx, %ecx
+; FALLBACK26-NEXT:    shlxl %eax, %ecx, %esi
+; FALLBACK26-NEXT:    movl 40(%esp,%edi), %eax
+; FALLBACK26-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK26-NEXT:    shrxl %edx, %eax, %ebx
+; FALLBACK26-NEXT:    orl %ebx, %esi
+; FALLBACK26-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK26-NEXT:    movl 56(%esp,%edi), %esi
+; FALLBACK26-NEXT:    leal (%esi,%esi), %ebx
+; FALLBACK26-NEXT:    shlxl %ebp, %ebx, %eax
+; FALLBACK26-NEXT:    movl %ebp, %ecx
+; FALLBACK26-NEXT:    movl 52(%esp,%edi), %ebx
+; FALLBACK26-NEXT:    shrxl %edx, %ebx, %ebp
+; FALLBACK26-NEXT:    orl %ebp, %eax
+; FALLBACK26-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK26-NEXT:    shrxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
+; FALLBACK26-NEXT:    addl %ebx, %ebx
+; FALLBACK26-NEXT:    shlxl %ecx, %ebx, %ebx
+; FALLBACK26-NEXT:    orl %ebp, %ebx
+; FALLBACK26-NEXT:    shrxl %edx, %esi, %ebp
+; FALLBACK26-NEXT:    shrxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
+; FALLBACK26-NEXT:    movl 60(%esp,%edi), %edi
+; FALLBACK26-NEXT:    shrxl %edx, %edi, %eax
+; FALLBACK26-NEXT:    addl %edi, %edi
+; FALLBACK26-NEXT:    movl %ecx, %edx
+; FALLBACK26-NEXT:    shlxl %ecx, %edi, %edi
+; FALLBACK26-NEXT:    orl %ebp, %edi
+; FALLBACK26-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK26-NEXT:    addl %ecx, %ecx
+; FALLBACK26-NEXT:    shlxl %edx, %ecx, %ecx
+; FALLBACK26-NEXT:    orl %esi, %ecx
+; FALLBACK26-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; FALLBACK26-NEXT:    movl %eax, 28(%edx)
+; FALLBACK26-NEXT:    movl %ecx, 4(%edx)
+; FALLBACK26-NEXT:    movl %edi, 24(%edx)
+; FALLBACK26-NEXT:    movl %ebx, 16(%edx)
+; FALLBACK26-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK26-NEXT:    movl %eax, 20(%edx)
+; FALLBACK26-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK26-NEXT:    movl %eax, 8(%edx)
+; FALLBACK26-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK26-NEXT:    movl %eax, 12(%edx)
+; FALLBACK26-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK26-NEXT:    movl %eax, (%edx)
+; FALLBACK26-NEXT:    addl $108, %esp
+; FALLBACK26-NEXT:    popl %esi
+; FALLBACK26-NEXT:    popl %edi
+; FALLBACK26-NEXT:    popl %ebx
+; FALLBACK26-NEXT:    popl %ebp
+; FALLBACK26-NEXT:    vzeroupper
+; FALLBACK26-NEXT:    retl
+;
+; FALLBACK27-LABEL: lshr_32bytes:
+; FALLBACK27:       # %bb.0:
+; FALLBACK27-NEXT:    pushl %ebp
+; FALLBACK27-NEXT:    pushl %ebx
+; FALLBACK27-NEXT:    pushl %edi
+; FALLBACK27-NEXT:    pushl %esi
+; FALLBACK27-NEXT:    subl $108, %esp
+; FALLBACK27-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; FALLBACK27-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; FALLBACK27-NEXT:    vmovups (%ecx), %ymm0
+; FALLBACK27-NEXT:    movzbl (%eax), %eax
+; FALLBACK27-NEXT:    movl %eax, %ecx
+; FALLBACK27-NEXT:    shlb $3, %cl
+; FALLBACK27-NEXT:    vxorps %xmm1, %xmm1, %xmm1
+; FALLBACK27-NEXT:    vmovups %ymm1, {{[0-9]+}}(%esp)
+; FALLBACK27-NEXT:    vmovups %ymm0, {{[0-9]+}}(%esp)
+; FALLBACK27-NEXT:    andb $28, %al
+; FALLBACK27-NEXT:    movzbl %al, %ebx
+; FALLBACK27-NEXT:    movl 48(%esp,%ebx), %esi
+; FALLBACK27-NEXT:    movl 44(%esp,%ebx), %eax
+; FALLBACK27-NEXT:    movl %eax, %edx
+; FALLBACK27-NEXT:    shrdl %cl, %esi, %edx
+; FALLBACK27-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK27-NEXT:    movl 40(%esp,%ebx), %edx
+; FALLBACK27-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK27-NEXT:    shrdl %cl, %eax, %edx
+; FALLBACK27-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK27-NEXT:    movl 56(%esp,%ebx), %ebp
+; FALLBACK27-NEXT:    movl 52(%esp,%ebx), %eax
+; FALLBACK27-NEXT:    movl %eax, %edi
+; FALLBACK27-NEXT:    shrdl %cl, %ebp, %edi
+; FALLBACK27-NEXT:    shrdl %cl, %eax, %esi
+; FALLBACK27-NEXT:    movl 60(%esp,%ebx), %eax
+; FALLBACK27-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK27-NEXT:    shrdl %cl, %eax, %ebp
+; FALLBACK27-NEXT:    movl 32(%esp,%ebx), %edx
+; FALLBACK27-NEXT:    movl 36(%esp,%ebx), %ebx
+; FALLBACK27-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK27-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK27-NEXT:    shrdl %cl, %eax, %ebx
+; FALLBACK27-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; FALLBACK27-NEXT:    movl %ebx, 4(%eax)
+; FALLBACK27-NEXT:    movl %ebp, 24(%eax)
+; FALLBACK27-NEXT:    shrxl %ecx, {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
+; FALLBACK27-NEXT:    movl %ebx, 28(%eax)
+; FALLBACK27-NEXT:    movl %esi, 16(%eax)
+; FALLBACK27-NEXT:    movl %edi, 20(%eax)
+; FALLBACK27-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; FALLBACK27-NEXT:    movl %esi, 8(%eax)
+; FALLBACK27-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; FALLBACK27-NEXT:    movl %esi, 12(%eax)
+; FALLBACK27-NEXT:    # kill: def $cl killed $cl killed $ecx
+; FALLBACK27-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; FALLBACK27-NEXT:    shrdl %cl, %esi, %edx
+; FALLBACK27-NEXT:    movl %edx, (%eax)
+; FALLBACK27-NEXT:    addl $108, %esp
+; FALLBACK27-NEXT:    popl %esi
+; FALLBACK27-NEXT:    popl %edi
+; FALLBACK27-NEXT:    popl %ebx
+; FALLBACK27-NEXT:    popl %ebp
+; FALLBACK27-NEXT:    vzeroupper
+; FALLBACK27-NEXT:    retl
+;
+; FALLBACK28-LABEL: lshr_32bytes:
+; FALLBACK28:       # %bb.0:
+; FALLBACK28-NEXT:    pushl %ebp
+; FALLBACK28-NEXT:    pushl %ebx
+; FALLBACK28-NEXT:    pushl %edi
+; FALLBACK28-NEXT:    pushl %esi
+; FALLBACK28-NEXT:    subl $108, %esp
+; FALLBACK28-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; FALLBACK28-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; FALLBACK28-NEXT:    vmovups (%ecx), %ymm0
+; FALLBACK28-NEXT:    movzbl (%eax), %ecx
+; FALLBACK28-NEXT:    movl %ecx, %eax
+; FALLBACK28-NEXT:    shlb $3, %al
+; FALLBACK28-NEXT:    vxorps %xmm1, %xmm1, %xmm1
+; FALLBACK28-NEXT:    vmovups %ymm1, {{[0-9]+}}(%esp)
+; FALLBACK28-NEXT:    vmovups %ymm0, {{[0-9]+}}(%esp)
+; FALLBACK28-NEXT:    andb $28, %cl
+; FALLBACK28-NEXT:    movzbl %cl, %ecx
+; FALLBACK28-NEXT:    movl 32(%esp,%ecx), %esi
+; FALLBACK28-NEXT:    movl 36(%esp,%ecx), %ebx
+; FALLBACK28-NEXT:    movl %ecx, %edi
+; FALLBACK28-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK28-NEXT:    movl %eax, %ecx
+; FALLBACK28-NEXT:    shrl %cl, %esi
+; FALLBACK28-NEXT:    movl %eax, %edx
+; FALLBACK28-NEXT:    notb %dl
+; FALLBACK28-NEXT:    addl %ebx, %ebx
+; FALLBACK28-NEXT:    movl %edx, %ecx
+; FALLBACK28-NEXT:    shll %cl, %ebx
+; FALLBACK28-NEXT:    orl %esi, %ebx
+; FALLBACK28-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK28-NEXT:    movl 44(%esp,%edi), %ebp
+; FALLBACK28-NEXT:    movl %ebp, %esi
+; FALLBACK28-NEXT:    movl %eax, %ecx
+; FALLBACK28-NEXT:    shrl %cl, %esi
+; FALLBACK28-NEXT:    movl 48(%esp,%edi), %ecx
+; FALLBACK28-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK28-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK28-NEXT:    leal (%ecx,%ecx), %ebx
+; FALLBACK28-NEXT:    movl %edx, %ecx
+; FALLBACK28-NEXT:    shll %cl, %ebx
+; FALLBACK28-NEXT:    orl %esi, %ebx
+; FALLBACK28-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK28-NEXT:    movl 40(%esp,%edi), %esi
+; FALLBACK28-NEXT:    movl %esi, %ebx
+; FALLBACK28-NEXT:    movl %eax, %ecx
+; FALLBACK28-NEXT:    shrl %cl, %ebx
+; FALLBACK28-NEXT:    addl %ebp, %ebp
+; FALLBACK28-NEXT:    movl %edx, %ecx
+; FALLBACK28-NEXT:    shll %cl, %ebp
+; FALLBACK28-NEXT:    orl %ebx, %ebp
+; FALLBACK28-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK28-NEXT:    movl 52(%esp,%edi), %ebp
+; FALLBACK28-NEXT:    movl %ebp, %ebx
+; FALLBACK28-NEXT:    movl %eax, %ecx
+; FALLBACK28-NEXT:    shrl %cl, %ebx
+; FALLBACK28-NEXT:    movl 56(%esp,%edi), %ecx
+; FALLBACK28-NEXT:    movl %ecx, (%esp) # 4-byte Spill
+; FALLBACK28-NEXT:    leal (%ecx,%ecx), %edi
+; FALLBACK28-NEXT:    movl %edx, %ecx
+; FALLBACK28-NEXT:    shll %cl, %edi
+; FALLBACK28-NEXT:    orl %ebx, %edi
+; FALLBACK28-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK28-NEXT:    movl %eax, %ecx
+; FALLBACK28-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; FALLBACK28-NEXT:    shrl %cl, %edi
+; FALLBACK28-NEXT:    addl %ebp, %ebp
+; FALLBACK28-NEXT:    movl %edx, %ecx
+; FALLBACK28-NEXT:    shll %cl, %ebp
+; FALLBACK28-NEXT:    orl %edi, %ebp
+; FALLBACK28-NEXT:    movl %eax, %ecx
+; FALLBACK28-NEXT:    shrl %cl, (%esp) # 4-byte Folded Spill
+; FALLBACK28-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK28-NEXT:    movl 60(%esp,%ecx), %ebx
+; FALLBACK28-NEXT:    leal (%ebx,%ebx), %edi
+; FALLBACK28-NEXT:    movl %edx, %ecx
+; FALLBACK28-NEXT:    shll %cl, %edi
+; FALLBACK28-NEXT:    orl (%esp), %edi # 4-byte Folded Reload
+; FALLBACK28-NEXT:    movl %eax, %ecx
+; FALLBACK28-NEXT:    shrl %cl, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; FALLBACK28-NEXT:    addl %esi, %esi
+; FALLBACK28-NEXT:    movl %edx, %ecx
+; FALLBACK28-NEXT:    shll %cl, %esi
+; FALLBACK28-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
+; FALLBACK28-NEXT:    movl %eax, %ecx
+; FALLBACK28-NEXT:    shrl %cl, %ebx
+; FALLBACK28-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; FALLBACK28-NEXT:    movl %ebx, 28(%eax)
+; FALLBACK28-NEXT:    movl %esi, 4(%eax)
+; FALLBACK28-NEXT:    movl %edi, 24(%eax)
+; FALLBACK28-NEXT:    movl %ebp, 16(%eax)
+; FALLBACK28-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK28-NEXT:    movl %ecx, 20(%eax)
+; FALLBACK28-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK28-NEXT:    movl %ecx, 8(%eax)
+; FALLBACK28-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK28-NEXT:    movl %ecx, 12(%eax)
+; FALLBACK28-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK28-NEXT:    movl %ecx, (%eax)
+; FALLBACK28-NEXT:    addl $108, %esp
+; FALLBACK28-NEXT:    popl %esi
+; FALLBACK28-NEXT:    popl %edi
+; FALLBACK28-NEXT:    popl %ebx
+; FALLBACK28-NEXT:    popl %ebp
+; FALLBACK28-NEXT:    vzeroupper
+; FALLBACK28-NEXT:    retl
+;
+; FALLBACK29-LABEL: lshr_32bytes:
+; FALLBACK29:       # %bb.0:
+; FALLBACK29-NEXT:    pushl %ebp
+; FALLBACK29-NEXT:    pushl %ebx
+; FALLBACK29-NEXT:    pushl %edi
+; FALLBACK29-NEXT:    pushl %esi
+; FALLBACK29-NEXT:    subl $108, %esp
+; FALLBACK29-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; FALLBACK29-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; FALLBACK29-NEXT:    vmovups (%ecx), %ymm0
+; FALLBACK29-NEXT:    movzbl (%eax), %eax
+; FALLBACK29-NEXT:    movl %eax, %ecx
+; FALLBACK29-NEXT:    shlb $3, %cl
+; FALLBACK29-NEXT:    vxorps %xmm1, %xmm1, %xmm1
+; FALLBACK29-NEXT:    vmovups %ymm1, {{[0-9]+}}(%esp)
+; FALLBACK29-NEXT:    vmovups %ymm0, {{[0-9]+}}(%esp)
+; FALLBACK29-NEXT:    andb $28, %al
+; FALLBACK29-NEXT:    movzbl %al, %ebp
+; FALLBACK29-NEXT:    movl 48(%esp,%ebp), %esi
+; FALLBACK29-NEXT:    movl 44(%esp,%ebp), %eax
+; FALLBACK29-NEXT:    movl %eax, %edx
+; FALLBACK29-NEXT:    shrdl %cl, %esi, %edx
+; FALLBACK29-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK29-NEXT:    movl 40(%esp,%ebp), %edx
+; FALLBACK29-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK29-NEXT:    shrdl %cl, %eax, %edx
+; FALLBACK29-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK29-NEXT:    movl 56(%esp,%ebp), %ebx
+; FALLBACK29-NEXT:    movl 52(%esp,%ebp), %eax
+; FALLBACK29-NEXT:    movl %eax, %edx
+; FALLBACK29-NEXT:    shrdl %cl, %ebx, %edx
+; FALLBACK29-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK29-NEXT:    shrdl %cl, %eax, %esi
+; FALLBACK29-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK29-NEXT:    movl 60(%esp,%ebp), %eax
+; FALLBACK29-NEXT:    shrdl %cl, %eax, %ebx
+; FALLBACK29-NEXT:    movl 32(%esp,%ebp), %edx
+; FALLBACK29-NEXT:    movl 36(%esp,%ebp), %edi
+; FALLBACK29-NEXT:    movl %edi, %esi
+; FALLBACK29-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
+; FALLBACK29-NEXT:    shrdl %cl, %ebp, %esi
+; FALLBACK29-NEXT:    movl {{[0-9]+}}(%esp), %ebp
+; FALLBACK29-NEXT:    movl %esi, 4(%ebp)
+; FALLBACK29-NEXT:    movl %ebx, 24(%ebp)
+; FALLBACK29-NEXT:    shrdl %cl, %edi, %edx
+; FALLBACK29-NEXT:    shrl %cl, %eax
+; FALLBACK29-NEXT:    movl %eax, 28(%ebp)
+; FALLBACK29-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK29-NEXT:    movl %eax, 16(%ebp)
+; FALLBACK29-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK29-NEXT:    movl %eax, 20(%ebp)
+; FALLBACK29-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK29-NEXT:    movl %eax, 8(%ebp)
+; FALLBACK29-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK29-NEXT:    movl %eax, 12(%ebp)
+; FALLBACK29-NEXT:    movl %edx, (%ebp)
+; FALLBACK29-NEXT:    addl $108, %esp
+; FALLBACK29-NEXT:    popl %esi
+; FALLBACK29-NEXT:    popl %edi
+; FALLBACK29-NEXT:    popl %ebx
+; FALLBACK29-NEXT:    popl %ebp
+; FALLBACK29-NEXT:    vzeroupper
+; FALLBACK29-NEXT:    retl
+;
+; FALLBACK30-LABEL: lshr_32bytes:
+; FALLBACK30:       # %bb.0:
+; FALLBACK30-NEXT:    pushl %ebp
+; FALLBACK30-NEXT:    pushl %ebx
+; FALLBACK30-NEXT:    pushl %edi
+; FALLBACK30-NEXT:    pushl %esi
+; FALLBACK30-NEXT:    subl $108, %esp
+; FALLBACK30-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; FALLBACK30-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; FALLBACK30-NEXT:    vmovups (%ecx), %ymm0
+; FALLBACK30-NEXT:    movzbl (%eax), %ecx
+; FALLBACK30-NEXT:    movl %ecx, %edx
+; FALLBACK30-NEXT:    shlb $3, %dl
+; FALLBACK30-NEXT:    vxorps %xmm1, %xmm1, %xmm1
+; FALLBACK30-NEXT:    vmovups %ymm1, {{[0-9]+}}(%esp)
+; FALLBACK30-NEXT:    vmovups %ymm0, {{[0-9]+}}(%esp)
+; FALLBACK30-NEXT:    andb $28, %cl
+; FALLBACK30-NEXT:    movzbl %cl, %edi
+; FALLBACK30-NEXT:    shrxl %edx, 32(%esp,%edi), %ecx
+; FALLBACK30-NEXT:    movl %edx, %eax
+; FALLBACK30-NEXT:    notb %al
+; FALLBACK30-NEXT:    movl 36(%esp,%edi), %esi
+; FALLBACK30-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK30-NEXT:    addl %esi, %esi
+; FALLBACK30-NEXT:    shlxl %eax, %esi, %esi
+; FALLBACK30-NEXT:    orl %ecx, %esi
+; FALLBACK30-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK30-NEXT:    movl 48(%esp,%edi), %ecx
+; FALLBACK30-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK30-NEXT:    addl %ecx, %ecx
+; FALLBACK30-NEXT:    shlxl %eax, %ecx, %esi
+; FALLBACK30-NEXT:    movl %eax, %ebp
+; FALLBACK30-NEXT:    movl 44(%esp,%edi), %ecx
+; FALLBACK30-NEXT:    shrxl %edx, %ecx, %ebx
+; FALLBACK30-NEXT:    orl %ebx, %esi
+; FALLBACK30-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK30-NEXT:    addl %ecx, %ecx
+; FALLBACK30-NEXT:    shlxl %eax, %ecx, %esi
+; FALLBACK30-NEXT:    movl 40(%esp,%edi), %eax
+; FALLBACK30-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK30-NEXT:    shrxl %edx, %eax, %ebx
+; FALLBACK30-NEXT:    orl %ebx, %esi
+; FALLBACK30-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK30-NEXT:    movl 56(%esp,%edi), %esi
+; FALLBACK30-NEXT:    leal (%esi,%esi), %ebx
+; FALLBACK30-NEXT:    shlxl %ebp, %ebx, %eax
+; FALLBACK30-NEXT:    movl %ebp, %ecx
+; FALLBACK30-NEXT:    movl 52(%esp,%edi), %ebx
+; FALLBACK30-NEXT:    shrxl %edx, %ebx, %ebp
+; FALLBACK30-NEXT:    orl %ebp, %eax
+; FALLBACK30-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK30-NEXT:    shrxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
+; FALLBACK30-NEXT:    addl %ebx, %ebx
+; FALLBACK30-NEXT:    shlxl %ecx, %ebx, %ebx
+; FALLBACK30-NEXT:    orl %ebp, %ebx
+; FALLBACK30-NEXT:    shrxl %edx, %esi, %ebp
+; FALLBACK30-NEXT:    shrxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
+; FALLBACK30-NEXT:    movl 60(%esp,%edi), %edi
+; FALLBACK30-NEXT:    shrxl %edx, %edi, %eax
+; FALLBACK30-NEXT:    addl %edi, %edi
+; FALLBACK30-NEXT:    movl %ecx, %edx
+; FALLBACK30-NEXT:    shlxl %ecx, %edi, %edi
+; FALLBACK30-NEXT:    orl %ebp, %edi
+; FALLBACK30-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK30-NEXT:    addl %ecx, %ecx
+; FALLBACK30-NEXT:    shlxl %edx, %ecx, %ecx
+; FALLBACK30-NEXT:    orl %esi, %ecx
+; FALLBACK30-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; FALLBACK30-NEXT:    movl %eax, 28(%edx)
+; FALLBACK30-NEXT:    movl %ecx, 4(%edx)
+; FALLBACK30-NEXT:    movl %edi, 24(%edx)
+; FALLBACK30-NEXT:    movl %ebx, 16(%edx)
+; FALLBACK30-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK30-NEXT:    movl %eax, 20(%edx)
+; FALLBACK30-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK30-NEXT:    movl %eax, 8(%edx)
+; FALLBACK30-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK30-NEXT:    movl %eax, 12(%edx)
+; FALLBACK30-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK30-NEXT:    movl %eax, (%edx)
+; FALLBACK30-NEXT:    addl $108, %esp
+; FALLBACK30-NEXT:    popl %esi
+; FALLBACK30-NEXT:    popl %edi
+; FALLBACK30-NEXT:    popl %ebx
+; FALLBACK30-NEXT:    popl %ebp
+; FALLBACK30-NEXT:    vzeroupper
+; FALLBACK30-NEXT:    retl
+;
+; FALLBACK31-LABEL: lshr_32bytes:
+; FALLBACK31:       # %bb.0:
+; FALLBACK31-NEXT:    pushl %ebp
+; FALLBACK31-NEXT:    pushl %ebx
+; FALLBACK31-NEXT:    pushl %edi
+; FALLBACK31-NEXT:    pushl %esi
+; FALLBACK31-NEXT:    subl $108, %esp
+; FALLBACK31-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; FALLBACK31-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; FALLBACK31-NEXT:    vmovups (%ecx), %ymm0
+; FALLBACK31-NEXT:    movzbl (%eax), %eax
+; FALLBACK31-NEXT:    movl %eax, %ecx
+; FALLBACK31-NEXT:    shlb $3, %cl
+; FALLBACK31-NEXT:    vxorps %xmm1, %xmm1, %xmm1
+; FALLBACK31-NEXT:    vmovups %ymm1, {{[0-9]+}}(%esp)
+; FALLBACK31-NEXT:    vmovups %ymm0, {{[0-9]+}}(%esp)
+; FALLBACK31-NEXT:    andb $28, %al
+; FALLBACK31-NEXT:    movzbl %al, %ebx
+; FALLBACK31-NEXT:    movl 48(%esp,%ebx), %esi
+; FALLBACK31-NEXT:    movl 44(%esp,%ebx), %eax
+; FALLBACK31-NEXT:    movl %eax, %edx
+; FALLBACK31-NEXT:    shrdl %cl, %esi, %edx
+; FALLBACK31-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK31-NEXT:    movl 40(%esp,%ebx), %edx
+; FALLBACK31-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK31-NEXT:    shrdl %cl, %eax, %edx
+; FALLBACK31-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK31-NEXT:    movl 56(%esp,%ebx), %ebp
+; FALLBACK31-NEXT:    movl 52(%esp,%ebx), %eax
+; FALLBACK31-NEXT:    movl %eax, %edi
+; FALLBACK31-NEXT:    shrdl %cl, %ebp, %edi
+; FALLBACK31-NEXT:    shrdl %cl, %eax, %esi
+; FALLBACK31-NEXT:    movl 60(%esp,%ebx), %eax
+; FALLBACK31-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK31-NEXT:    shrdl %cl, %eax, %ebp
+; FALLBACK31-NEXT:    movl 32(%esp,%ebx), %edx
+; FALLBACK31-NEXT:    movl 36(%esp,%ebx), %ebx
+; FALLBACK31-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK31-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK31-NEXT:    shrdl %cl, %eax, %ebx
+; FALLBACK31-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; FALLBACK31-NEXT:    movl %ebx, 4(%eax)
+; FALLBACK31-NEXT:    movl %ebp, 24(%eax)
+; FALLBACK31-NEXT:    shrxl %ecx, {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
+; FALLBACK31-NEXT:    movl %ebx, 28(%eax)
+; FALLBACK31-NEXT:    movl %esi, 16(%eax)
+; FALLBACK31-NEXT:    movl %edi, 20(%eax)
+; FALLBACK31-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; FALLBACK31-NEXT:    movl %esi, 8(%eax)
+; FALLBACK31-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; FALLBACK31-NEXT:    movl %esi, 12(%eax)
+; FALLBACK31-NEXT:    # kill: def $cl killed $cl killed $ecx
+; FALLBACK31-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; FALLBACK31-NEXT:    shrdl %cl, %esi, %edx
+; FALLBACK31-NEXT:    movl %edx, (%eax)
+; FALLBACK31-NEXT:    addl $108, %esp
+; FALLBACK31-NEXT:    popl %esi
+; FALLBACK31-NEXT:    popl %edi
+; FALLBACK31-NEXT:    popl %ebx
+; FALLBACK31-NEXT:    popl %ebp
+; FALLBACK31-NEXT:    vzeroupper
+; FALLBACK31-NEXT:    retl
+  %src = load i256, ptr %src.ptr, align 1
+  %byteOff = load i256, ptr %byteOff.ptr, align 1
+  %bitOff = shl i256 %byteOff, 3
+  %res = lshr i256 %src, %bitOff
+  store i256 %res, ptr %dst, align 1
+  ret void
+}
+
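(For readers skimming the checks above: the store-to-stack-and-reload strategy
these FALLBACK blocks exercise can be modeled in a few lines of C. This is a
sketch under stated assumptions, not the committed legalizer code: it assumes a
little-endian target and, like the generated code's movzbl, uses only the low
byte of the offset; the helper name lshr_32bytes_model is made up for
illustration.)

    #include <stdint.h>
    #include <string.h>

    /* Model of the lshr_32bytes test semantics: shift a 32-byte value
     * right by byteOff*8 bits via a zero-padded staging buffer, the
     * same trick as the zeroed stack slot built by the checks above. */
    static void lshr_32bytes_model(const uint8_t *src,
                                   const uint8_t *byte_off_ptr,
                                   uint8_t *dst) {
      uint8_t buf[64] = {0};          /* low half = source, upper half = zeros */
      memcpy(buf, src, 32);
      unsigned off = byte_off_ptr[0] & 31; /* cf. andb $28 plus residual shrdl */
      /* On little-endian, a byte-multiple right shift of the wide value
       * is just a displaced load from the zero-padded buffer.          */
      memcpy(dst, buf + off, 32);
    }

(The committed lowering works at native-register granularity rather than bytes,
which is why the checks do aligned movl/movq loads from the slot and then a
sub-register shrdl/shrxl for the residual bit count; when the shift amount is a
multiple of the register width, that follow-up shift disappears entirely.)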
+define void @lshr_32bytes_dwordOff(ptr %src.ptr, ptr %dwordOff.ptr, ptr %dst) nounwind {
+; FALLBACK0-LABEL: lshr_32bytes_dwordOff:
+; FALLBACK0:       # %bb.0:
+; FALLBACK0-NEXT:    pushq %rbx
+; FALLBACK0-NEXT:    movq (%rdi), %rcx
+; FALLBACK0-NEXT:    movq 8(%rdi), %r8
+; FALLBACK0-NEXT:    movq 16(%rdi), %r9
+; FALLBACK0-NEXT:    movq 24(%rdi), %rdi
+; FALLBACK0-NEXT:    movzbl (%rsi), %esi
+; FALLBACK0-NEXT:    movl %esi, %eax
+; FALLBACK0-NEXT:    shlb $5, %al
+; FALLBACK0-NEXT:    xorps %xmm0, %xmm0
+; FALLBACK0-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
+; FALLBACK0-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
+; FALLBACK0-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK0-NEXT:    movq %r9, -{{[0-9]+}}(%rsp)
+; FALLBACK0-NEXT:    movq %r8, -{{[0-9]+}}(%rsp)
+; FALLBACK0-NEXT:    movq %rcx, -{{[0-9]+}}(%rsp)
+; FALLBACK0-NEXT:    andb $6, %sil
+; FALLBACK0-NEXT:    movzbl %sil, %r9d
+; FALLBACK0-NEXT:    movq -64(%rsp,%r9,4), %r10
+; FALLBACK0-NEXT:    movq -56(%rsp,%r9,4), %rdi
+; FALLBACK0-NEXT:    movq %rdi, %r11
+; FALLBACK0-NEXT:    movl %eax, %ecx
+; FALLBACK0-NEXT:    shrq %cl, %r11
+; FALLBACK0-NEXT:    movl %eax, %esi
+; FALLBACK0-NEXT:    notb %sil
+; FALLBACK0-NEXT:    movq -48(%rsp,%r9,4), %rbx
+; FALLBACK0-NEXT:    leaq (%rbx,%rbx), %r8
+; FALLBACK0-NEXT:    movl %esi, %ecx
+; FALLBACK0-NEXT:    shlq %cl, %r8
+; FALLBACK0-NEXT:    orq %r11, %r8
+; FALLBACK0-NEXT:    movl %eax, %ecx
+; FALLBACK0-NEXT:    shrq %cl, %r10
+; FALLBACK0-NEXT:    addq %rdi, %rdi
+; FALLBACK0-NEXT:    movl %esi, %ecx
+; FALLBACK0-NEXT:    shlq %cl, %rdi
+; FALLBACK0-NEXT:    orq %r10, %rdi
+; FALLBACK0-NEXT:    movl %eax, %ecx
+; FALLBACK0-NEXT:    shrq %cl, %rbx
+; FALLBACK0-NEXT:    movq -40(%rsp,%r9,4), %r9
+; FALLBACK0-NEXT:    leaq (%r9,%r9), %r10
+; FALLBACK0-NEXT:    movl %esi, %ecx
+; FALLBACK0-NEXT:    shlq %cl, %r10
+; FALLBACK0-NEXT:    orq %rbx, %r10
+; FALLBACK0-NEXT:    movl %eax, %ecx
+; FALLBACK0-NEXT:    shrq %cl, %r9
+; FALLBACK0-NEXT:    movq %r9, 24(%rdx)
+; FALLBACK0-NEXT:    movq %r10, 16(%rdx)
+; FALLBACK0-NEXT:    movq %rdi, (%rdx)
+; FALLBACK0-NEXT:    movq %r8, 8(%rdx)
+; FALLBACK0-NEXT:    popq %rbx
+; FALLBACK0-NEXT:    retq
+;
+; FALLBACK1-LABEL: lshr_32bytes_dwordOff:
+; FALLBACK1:       # %bb.0:
+; FALLBACK1-NEXT:    movq (%rdi), %rax
+; FALLBACK1-NEXT:    movq 8(%rdi), %r8
+; FALLBACK1-NEXT:    movq 16(%rdi), %r9
+; FALLBACK1-NEXT:    movq 24(%rdi), %rdi
+; FALLBACK1-NEXT:    movzbl (%rsi), %esi
+; FALLBACK1-NEXT:    movl %esi, %ecx
+; FALLBACK1-NEXT:    shlb $5, %cl
+; FALLBACK1-NEXT:    xorps %xmm0, %xmm0
+; FALLBACK1-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
+; FALLBACK1-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
+; FALLBACK1-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK1-NEXT:    movq %r9, -{{[0-9]+}}(%rsp)
+; FALLBACK1-NEXT:    movq %r8, -{{[0-9]+}}(%rsp)
+; FALLBACK1-NEXT:    movq %rax, -{{[0-9]+}}(%rsp)
+; FALLBACK1-NEXT:    andb $6, %sil
+; FALLBACK1-NEXT:    movzbl %sil, %eax
+; FALLBACK1-NEXT:    movq -56(%rsp,%rax,4), %rsi
+; FALLBACK1-NEXT:    movq -72(%rsp,%rax,4), %rdi
+; FALLBACK1-NEXT:    movq -64(%rsp,%rax,4), %r8
+; FALLBACK1-NEXT:    movq %r8, %r9
+; FALLBACK1-NEXT:    shrdq %cl, %rsi, %r9
+; FALLBACK1-NEXT:    movq -48(%rsp,%rax,4), %rax
+; FALLBACK1-NEXT:    shrdq %cl, %rax, %rsi
+; FALLBACK1-NEXT:    shrdq %cl, %r8, %rdi
+; FALLBACK1-NEXT:    shrq %cl, %rax
+; FALLBACK1-NEXT:    movq %rsi, 16(%rdx)
+; FALLBACK1-NEXT:    movq %rax, 24(%rdx)
+; FALLBACK1-NEXT:    movq %rdi, (%rdx)
+; FALLBACK1-NEXT:    movq %r9, 8(%rdx)
+; FALLBACK1-NEXT:    retq
+;
+; FALLBACK2-LABEL: lshr_32bytes_dwordOff:
+; FALLBACK2:       # %bb.0:
+; FALLBACK2-NEXT:    movq (%rdi), %rcx
+; FALLBACK2-NEXT:    movq 8(%rdi), %r8
+; FALLBACK2-NEXT:    movq 16(%rdi), %r9
+; FALLBACK2-NEXT:    movq 24(%rdi), %rdi
+; FALLBACK2-NEXT:    movzbl (%rsi), %esi
+; FALLBACK2-NEXT:    movl %esi, %eax
+; FALLBACK2-NEXT:    shlb $5, %al
+; FALLBACK2-NEXT:    xorps %xmm0, %xmm0
+; FALLBACK2-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
+; FALLBACK2-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
+; FALLBACK2-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK2-NEXT:    movq %r9, -{{[0-9]+}}(%rsp)
+; FALLBACK2-NEXT:    movq %r8, -{{[0-9]+}}(%rsp)
+; FALLBACK2-NEXT:    movq %rcx, -{{[0-9]+}}(%rsp)
+; FALLBACK2-NEXT:    andb $6, %sil
+; FALLBACK2-NEXT:    movzbl %sil, %ecx
+; FALLBACK2-NEXT:    movq -64(%rsp,%rcx,4), %rsi
+; FALLBACK2-NEXT:    movq -56(%rsp,%rcx,4), %rdi
+; FALLBACK2-NEXT:    shrxq %rax, %rsi, %r8
+; FALLBACK2-NEXT:    shrxq %rax, -72(%rsp,%rcx,4), %r9
+; FALLBACK2-NEXT:    shrxq %rax, %rdi, %r10
+; FALLBACK2-NEXT:    movq -48(%rsp,%rcx,4), %rcx
+; FALLBACK2-NEXT:    shrxq %rax, %rcx, %r11
+; FALLBACK2-NEXT:    # kill: def $al killed $al killed $rax def $rax
+; FALLBACK2-NEXT:    notb %al
+; FALLBACK2-NEXT:    addq %rdi, %rdi
+; FALLBACK2-NEXT:    shlxq %rax, %rdi, %rdi
+; FALLBACK2-NEXT:    orq %r8, %rdi
+; FALLBACK2-NEXT:    addq %rsi, %rsi
+; FALLBACK2-NEXT:    shlxq %rax, %rsi, %rsi
+; FALLBACK2-NEXT:    orq %r9, %rsi
+; FALLBACK2-NEXT:    addq %rcx, %rcx
+; FALLBACK2-NEXT:    shlxq %rax, %rcx, %rax
+; FALLBACK2-NEXT:    orq %r10, %rax
+; FALLBACK2-NEXT:    movq %r11, 24(%rdx)
+; FALLBACK2-NEXT:    movq %rax, 16(%rdx)
+; FALLBACK2-NEXT:    movq %rsi, (%rdx)
+; FALLBACK2-NEXT:    movq %rdi, 8(%rdx)
+; FALLBACK2-NEXT:    retq
+;
+; FALLBACK3-LABEL: lshr_32bytes_dwordOff:
+; FALLBACK3:       # %bb.0:
+; FALLBACK3-NEXT:    movq (%rdi), %rax
+; FALLBACK3-NEXT:    movq 8(%rdi), %r8
+; FALLBACK3-NEXT:    movq 16(%rdi), %r9
+; FALLBACK3-NEXT:    movq 24(%rdi), %rdi
+; FALLBACK3-NEXT:    movzbl (%rsi), %esi
+; FALLBACK3-NEXT:    movl %esi, %ecx
+; FALLBACK3-NEXT:    shlb $5, %cl
+; FALLBACK3-NEXT:    xorps %xmm0, %xmm0
+; FALLBACK3-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
+; FALLBACK3-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
+; FALLBACK3-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK3-NEXT:    movq %r9, -{{[0-9]+}}(%rsp)
+; FALLBACK3-NEXT:    movq %r8, -{{[0-9]+}}(%rsp)
+; FALLBACK3-NEXT:    movq %rax, -{{[0-9]+}}(%rsp)
+; FALLBACK3-NEXT:    andb $6, %sil
+; FALLBACK3-NEXT:    movzbl %sil, %eax
+; FALLBACK3-NEXT:    movq -56(%rsp,%rax,4), %rsi
+; FALLBACK3-NEXT:    movq -72(%rsp,%rax,4), %rdi
+; FALLBACK3-NEXT:    movq -64(%rsp,%rax,4), %r8
+; FALLBACK3-NEXT:    movq %r8, %r9
+; FALLBACK3-NEXT:    shrdq %cl, %rsi, %r9
+; FALLBACK3-NEXT:    movq -48(%rsp,%rax,4), %rax
+; FALLBACK3-NEXT:    shrdq %cl, %rax, %rsi
+; FALLBACK3-NEXT:    shrdq %cl, %r8, %rdi
+; FALLBACK3-NEXT:    shrxq %rcx, %rax, %rax
+; FALLBACK3-NEXT:    movq %rsi, 16(%rdx)
+; FALLBACK3-NEXT:    movq %rax, 24(%rdx)
+; FALLBACK3-NEXT:    movq %rdi, (%rdx)
+; FALLBACK3-NEXT:    movq %r9, 8(%rdx)
+; FALLBACK3-NEXT:    retq
+;
+; FALLBACK4-LABEL: lshr_32bytes_dwordOff:
+; FALLBACK4:       # %bb.0:
+; FALLBACK4-NEXT:    pushq %rbx
+; FALLBACK4-NEXT:    movups (%rdi), %xmm0
+; FALLBACK4-NEXT:    movups 16(%rdi), %xmm1
+; FALLBACK4-NEXT:    movzbl (%rsi), %ecx
+; FALLBACK4-NEXT:    movl %ecx, %eax
+; FALLBACK4-NEXT:    shlb $5, %al
+; FALLBACK4-NEXT:    xorps %xmm2, %xmm2
+; FALLBACK4-NEXT:    movaps %xmm2, -{{[0-9]+}}(%rsp)
+; FALLBACK4-NEXT:    movaps %xmm2, -{{[0-9]+}}(%rsp)
+; FALLBACK4-NEXT:    movaps %xmm1, -{{[0-9]+}}(%rsp)
+; FALLBACK4-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
+; FALLBACK4-NEXT:    andb $6, %cl
+; FALLBACK4-NEXT:    movzbl %cl, %r9d
+; FALLBACK4-NEXT:    movq -64(%rsp,%r9,4), %r10
+; FALLBACK4-NEXT:    movq -56(%rsp,%r9,4), %r8
+; FALLBACK4-NEXT:    movl %eax, %ecx
+; FALLBACK4-NEXT:    shrq %cl, %r10
+; FALLBACK4-NEXT:    movl %eax, %esi
+; FALLBACK4-NEXT:    notb %sil
+; FALLBACK4-NEXT:    leaq (%r8,%r8), %rdi
+; FALLBACK4-NEXT:    movl %esi, %ecx
+; FALLBACK4-NEXT:    shlq %cl, %rdi
+; FALLBACK4-NEXT:    orq %r10, %rdi
+; FALLBACK4-NEXT:    movq -48(%rsp,%r9,4), %r10
+; FALLBACK4-NEXT:    movq %r10, %r11
+; FALLBACK4-NEXT:    movl %eax, %ecx
+; FALLBACK4-NEXT:    shrq %cl, %r11
+; FALLBACK4-NEXT:    movq -40(%rsp,%r9,4), %r9
+; FALLBACK4-NEXT:    leaq (%r9,%r9), %rbx
+; FALLBACK4-NEXT:    movl %esi, %ecx
+; FALLBACK4-NEXT:    shlq %cl, %rbx
+; FALLBACK4-NEXT:    orq %r11, %rbx
+; FALLBACK4-NEXT:    movl %eax, %ecx
+; FALLBACK4-NEXT:    shrq %cl, %r8
+; FALLBACK4-NEXT:    addq %r10, %r10
+; FALLBACK4-NEXT:    movl %esi, %ecx
+; FALLBACK4-NEXT:    shlq %cl, %r10
+; FALLBACK4-NEXT:    orq %r8, %r10
+; FALLBACK4-NEXT:    movl %eax, %ecx
+; FALLBACK4-NEXT:    shrq %cl, %r9
+; FALLBACK4-NEXT:    movq %r9, 24(%rdx)
+; FALLBACK4-NEXT:    movq %r10, 8(%rdx)
+; FALLBACK4-NEXT:    movq %rbx, 16(%rdx)
+; FALLBACK4-NEXT:    movq %rdi, (%rdx)
+; FALLBACK4-NEXT:    popq %rbx
+; FALLBACK4-NEXT:    retq
+;
+; FALLBACK5-LABEL: lshr_32bytes_dwordOff:
+; FALLBACK5:       # %bb.0:
+; FALLBACK5-NEXT:    movups (%rdi), %xmm0
+; FALLBACK5-NEXT:    movups 16(%rdi), %xmm1
+; FALLBACK5-NEXT:    movzbl (%rsi), %eax
+; FALLBACK5-NEXT:    movl %eax, %ecx
+; FALLBACK5-NEXT:    shlb $5, %cl
+; FALLBACK5-NEXT:    xorps %xmm2, %xmm2
+; FALLBACK5-NEXT:    movaps %xmm2, -{{[0-9]+}}(%rsp)
+; FALLBACK5-NEXT:    movaps %xmm2, -{{[0-9]+}}(%rsp)
+; FALLBACK5-NEXT:    movaps %xmm1, -{{[0-9]+}}(%rsp)
+; FALLBACK5-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
+; FALLBACK5-NEXT:    andb $6, %al
+; FALLBACK5-NEXT:    movzbl %al, %eax
+; FALLBACK5-NEXT:    movq -48(%rsp,%rax,4), %rsi
+; FALLBACK5-NEXT:    movq -56(%rsp,%rax,4), %rdi
+; FALLBACK5-NEXT:    movq %rdi, %r8
+; FALLBACK5-NEXT:    shrdq %cl, %rsi, %r8
+; FALLBACK5-NEXT:    movq -72(%rsp,%rax,4), %r9
+; FALLBACK5-NEXT:    movq -64(%rsp,%rax,4), %rax
+; FALLBACK5-NEXT:    movq %rax, %r10
+; FALLBACK5-NEXT:    shrdq %cl, %rdi, %r10
+; FALLBACK5-NEXT:    shrdq %cl, %rax, %r9
+; FALLBACK5-NEXT:    shrq %cl, %rsi
+; FALLBACK5-NEXT:    movq %r10, 8(%rdx)
+; FALLBACK5-NEXT:    movq %r8, 16(%rdx)
+; FALLBACK5-NEXT:    movq %rsi, 24(%rdx)
+; FALLBACK5-NEXT:    movq %r9, (%rdx)
+; FALLBACK5-NEXT:    retq
+;
+; FALLBACK6-LABEL: lshr_32bytes_dwordOff:
+; FALLBACK6:       # %bb.0:
+; FALLBACK6-NEXT:    movups (%rdi), %xmm0
+; FALLBACK6-NEXT:    movups 16(%rdi), %xmm1
+; FALLBACK6-NEXT:    movzbl (%rsi), %ecx
+; FALLBACK6-NEXT:    movl %ecx, %eax
+; FALLBACK6-NEXT:    shlb $5, %al
+; FALLBACK6-NEXT:    xorps %xmm2, %xmm2
+; FALLBACK6-NEXT:    movaps %xmm2, -{{[0-9]+}}(%rsp)
+; FALLBACK6-NEXT:    movaps %xmm2, -{{[0-9]+}}(%rsp)
+; FALLBACK6-NEXT:    movaps %xmm1, -{{[0-9]+}}(%rsp)
+; FALLBACK6-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
+; FALLBACK6-NEXT:    andb $6, %cl
+; FALLBACK6-NEXT:    movzbl %cl, %ecx
+; FALLBACK6-NEXT:    shrxq %rax, -72(%rsp,%rcx,4), %rsi
+; FALLBACK6-NEXT:    movq -64(%rsp,%rcx,4), %rdi
+; FALLBACK6-NEXT:    movq -56(%rsp,%rcx,4), %r8
+; FALLBACK6-NEXT:    shrxq %rax, %r8, %r9
+; FALLBACK6-NEXT:    movq -48(%rsp,%rcx,4), %rcx
+; FALLBACK6-NEXT:    shrxq %rax, %rdi, %r10
+; FALLBACK6-NEXT:    shrxq %rax, %rcx, %r11
+; FALLBACK6-NEXT:    # kill: def $al killed $al killed $rax def $rax
+; FALLBACK6-NEXT:    notb %al
+; FALLBACK6-NEXT:    addq %rdi, %rdi
+; FALLBACK6-NEXT:    shlxq %rax, %rdi, %rdi
+; FALLBACK6-NEXT:    orq %rsi, %rdi
+; FALLBACK6-NEXT:    addq %rcx, %rcx
+; FALLBACK6-NEXT:    shlxq %rax, %rcx, %rcx
+; FALLBACK6-NEXT:    orq %r9, %rcx
+; FALLBACK6-NEXT:    addq %r8, %r8
+; FALLBACK6-NEXT:    shlxq %rax, %r8, %rax
+; FALLBACK6-NEXT:    orq %r10, %rax
+; FALLBACK6-NEXT:    movq %r11, 24(%rdx)
+; FALLBACK6-NEXT:    movq %rax, 8(%rdx)
+; FALLBACK6-NEXT:    movq %rcx, 16(%rdx)
+; FALLBACK6-NEXT:    movq %rdi, (%rdx)
+; FALLBACK6-NEXT:    retq
+;
+; FALLBACK7-LABEL: lshr_32bytes_dwordOff:
+; FALLBACK7:       # %bb.0:
+; FALLBACK7-NEXT:    movups (%rdi), %xmm0
+; FALLBACK7-NEXT:    movups 16(%rdi), %xmm1
+; FALLBACK7-NEXT:    movzbl (%rsi), %eax
+; FALLBACK7-NEXT:    movl %eax, %ecx
+; FALLBACK7-NEXT:    shlb $5, %cl
+; FALLBACK7-NEXT:    xorps %xmm2, %xmm2
+; FALLBACK7-NEXT:    movaps %xmm2, -{{[0-9]+}}(%rsp)
+; FALLBACK7-NEXT:    movaps %xmm2, -{{[0-9]+}}(%rsp)
+; FALLBACK7-NEXT:    movaps %xmm1, -{{[0-9]+}}(%rsp)
+; FALLBACK7-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
+; FALLBACK7-NEXT:    andb $6, %al
+; FALLBACK7-NEXT:    movzbl %al, %eax
+; FALLBACK7-NEXT:    movq -48(%rsp,%rax,4), %rsi
+; FALLBACK7-NEXT:    movq -56(%rsp,%rax,4), %rdi
+; FALLBACK7-NEXT:    movq %rdi, %r8
+; FALLBACK7-NEXT:    shrdq %cl, %rsi, %r8
+; FALLBACK7-NEXT:    movq -72(%rsp,%rax,4), %r9
+; FALLBACK7-NEXT:    movq -64(%rsp,%rax,4), %rax
+; FALLBACK7-NEXT:    movq %rax, %r10
+; FALLBACK7-NEXT:    shrdq %cl, %rdi, %r10
+; FALLBACK7-NEXT:    shrdq %cl, %rax, %r9
+; FALLBACK7-NEXT:    shrxq %rcx, %rsi, %rax
+; FALLBACK7-NEXT:    movq %r10, 8(%rdx)
+; FALLBACK7-NEXT:    movq %r8, 16(%rdx)
+; FALLBACK7-NEXT:    movq %rax, 24(%rdx)
+; FALLBACK7-NEXT:    movq %r9, (%rdx)
+; FALLBACK7-NEXT:    retq
+;
+; FALLBACK8-LABEL: lshr_32bytes_dwordOff:
+; FALLBACK8:       # %bb.0:
+; FALLBACK8-NEXT:    pushq %rbx
+; FALLBACK8-NEXT:    vmovups (%rdi), %ymm0
+; FALLBACK8-NEXT:    movzbl (%rsi), %ecx
+; FALLBACK8-NEXT:    movl %ecx, %eax
+; FALLBACK8-NEXT:    shlb $5, %al
+; FALLBACK8-NEXT:    vxorps %xmm1, %xmm1, %xmm1
+; FALLBACK8-NEXT:    vmovups %ymm1, -{{[0-9]+}}(%rsp)
+; FALLBACK8-NEXT:    vmovups %ymm0, -{{[0-9]+}}(%rsp)
+; FALLBACK8-NEXT:    andb $6, %cl
+; FALLBACK8-NEXT:    movzbl %cl, %r9d
+; FALLBACK8-NEXT:    movq -64(%rsp,%r9,4), %r10
+; FALLBACK8-NEXT:    movq -56(%rsp,%r9,4), %r8
+; FALLBACK8-NEXT:    movl %eax, %ecx
+; FALLBACK8-NEXT:    shrq %cl, %r10
+; FALLBACK8-NEXT:    movl %eax, %esi
+; FALLBACK8-NEXT:    notb %sil
+; FALLBACK8-NEXT:    leaq (%r8,%r8), %rdi
+; FALLBACK8-NEXT:    movl %esi, %ecx
+; FALLBACK8-NEXT:    shlq %cl, %rdi
+; FALLBACK8-NEXT:    orq %r10, %rdi
+; FALLBACK8-NEXT:    movq -48(%rsp,%r9,4), %r10
+; FALLBACK8-NEXT:    movq %r10, %r11
+; FALLBACK8-NEXT:    movl %eax, %ecx
+; FALLBACK8-NEXT:    shrq %cl, %r11
+; FALLBACK8-NEXT:    movq -40(%rsp,%r9,4), %r9
+; FALLBACK8-NEXT:    leaq (%r9,%r9), %rbx
+; FALLBACK8-NEXT:    movl %esi, %ecx
+; FALLBACK8-NEXT:    shlq %cl, %rbx
+; FALLBACK8-NEXT:    orq %r11, %rbx
+; FALLBACK8-NEXT:    movl %eax, %ecx
+; FALLBACK8-NEXT:    shrq %cl, %r8
+; FALLBACK8-NEXT:    addq %r10, %r10
+; FALLBACK8-NEXT:    movl %esi, %ecx
+; FALLBACK8-NEXT:    shlq %cl, %r10
+; FALLBACK8-NEXT:    orq %r8, %r10
+; FALLBACK8-NEXT:    movl %eax, %ecx
+; FALLBACK8-NEXT:    shrq %cl, %r9
+; FALLBACK8-NEXT:    movq %r9, 24(%rdx)
+; FALLBACK8-NEXT:    movq %r10, 8(%rdx)
+; FALLBACK8-NEXT:    movq %rbx, 16(%rdx)
+; FALLBACK8-NEXT:    movq %rdi, (%rdx)
+; FALLBACK8-NEXT:    popq %rbx
+; FALLBACK8-NEXT:    vzeroupper
+; FALLBACK8-NEXT:    retq
+;
+; FALLBACK9-LABEL: lshr_32bytes_dwordOff:
+; FALLBACK9:       # %bb.0:
+; FALLBACK9-NEXT:    vmovups (%rdi), %ymm0
+; FALLBACK9-NEXT:    movzbl (%rsi), %eax
+; FALLBACK9-NEXT:    movl %eax, %ecx
+; FALLBACK9-NEXT:    shlb $5, %cl
+; FALLBACK9-NEXT:    vxorps %xmm1, %xmm1, %xmm1
+; FALLBACK9-NEXT:    vmovups %ymm1, -{{[0-9]+}}(%rsp)
+; FALLBACK9-NEXT:    vmovups %ymm0, -{{[0-9]+}}(%rsp)
+; FALLBACK9-NEXT:    andb $6, %al
+; FALLBACK9-NEXT:    movzbl %al, %eax
+; FALLBACK9-NEXT:    movq -48(%rsp,%rax,4), %rsi
+; FALLBACK9-NEXT:    movq -56(%rsp,%rax,4), %rdi
+; FALLBACK9-NEXT:    movq %rdi, %r8
+; FALLBACK9-NEXT:    shrdq %cl, %rsi, %r8
+; FALLBACK9-NEXT:    movq -72(%rsp,%rax,4), %r9
+; FALLBACK9-NEXT:    movq -64(%rsp,%rax,4), %rax
+; FALLBACK9-NEXT:    movq %rax, %r10
+; FALLBACK9-NEXT:    shrdq %cl, %rdi, %r10
+; FALLBACK9-NEXT:    shrdq %cl, %rax, %r9
+; FALLBACK9-NEXT:    shrq %cl, %rsi
+; FALLBACK9-NEXT:    movq %r10, 8(%rdx)
+; FALLBACK9-NEXT:    movq %r8, 16(%rdx)
+; FALLBACK9-NEXT:    movq %rsi, 24(%rdx)
+; FALLBACK9-NEXT:    movq %r9, (%rdx)
+; FALLBACK9-NEXT:    vzeroupper
+; FALLBACK9-NEXT:    retq
+;
+; FALLBACK10-LABEL: lshr_32bytes_dwordOff:
+; FALLBACK10:       # %bb.0:
+; FALLBACK10-NEXT:    vmovups (%rdi), %ymm0
+; FALLBACK10-NEXT:    movzbl (%rsi), %ecx
+; FALLBACK10-NEXT:    movl %ecx, %eax
+; FALLBACK10-NEXT:    shlb $5, %al
+; FALLBACK10-NEXT:    vxorps %xmm1, %xmm1, %xmm1
+; FALLBACK10-NEXT:    vmovups %ymm1, -{{[0-9]+}}(%rsp)
+; FALLBACK10-NEXT:    vmovups %ymm0, -{{[0-9]+}}(%rsp)
+; FALLBACK10-NEXT:    andb $6, %cl
+; FALLBACK10-NEXT:    movzbl %cl, %ecx
+; FALLBACK10-NEXT:    shrxq %rax, -72(%rsp,%rcx,4), %rsi
+; FALLBACK10-NEXT:    movq -64(%rsp,%rcx,4), %rdi
+; FALLBACK10-NEXT:    movq -56(%rsp,%rcx,4), %r8
+; FALLBACK10-NEXT:    shrxq %rax, %r8, %r9
+; FALLBACK10-NEXT:    movq -48(%rsp,%rcx,4), %rcx
+; FALLBACK10-NEXT:    shrxq %rax, %rdi, %r10
+; FALLBACK10-NEXT:    shrxq %rax, %rcx, %r11
+; FALLBACK10-NEXT:    # kill: def $al killed $al killed $rax def $rax
+; FALLBACK10-NEXT:    notb %al
+; FALLBACK10-NEXT:    addq %rdi, %rdi
+; FALLBACK10-NEXT:    shlxq %rax, %rdi, %rdi
+; FALLBACK10-NEXT:    orq %rsi, %rdi
+; FALLBACK10-NEXT:    addq %rcx, %rcx
+; FALLBACK10-NEXT:    shlxq %rax, %rcx, %rcx
+; FALLBACK10-NEXT:    orq %r9, %rcx
+; FALLBACK10-NEXT:    addq %r8, %r8
+; FALLBACK10-NEXT:    shlxq %rax, %r8, %rax
+; FALLBACK10-NEXT:    orq %r10, %rax
+; FALLBACK10-NEXT:    movq %r11, 24(%rdx)
+; FALLBACK10-NEXT:    movq %rax, 8(%rdx)
+; FALLBACK10-NEXT:    movq %rcx, 16(%rdx)
+; FALLBACK10-NEXT:    movq %rdi, (%rdx)
+; FALLBACK10-NEXT:    vzeroupper
+; FALLBACK10-NEXT:    retq
+;
+; FALLBACK11-LABEL: lshr_32bytes_dwordOff:
+; FALLBACK11:       # %bb.0:
+; FALLBACK11-NEXT:    vmovups (%rdi), %ymm0
+; FALLBACK11-NEXT:    movzbl (%rsi), %eax
+; FALLBACK11-NEXT:    movl %eax, %ecx
+; FALLBACK11-NEXT:    shlb $5, %cl
+; FALLBACK11-NEXT:    vxorps %xmm1, %xmm1, %xmm1
+; FALLBACK11-NEXT:    vmovups %ymm1, -{{[0-9]+}}(%rsp)
+; FALLBACK11-NEXT:    vmovups %ymm0, -{{[0-9]+}}(%rsp)
+; FALLBACK11-NEXT:    andb $6, %al
+; FALLBACK11-NEXT:    movzbl %al, %eax
+; FALLBACK11-NEXT:    movq -48(%rsp,%rax,4), %rsi
+; FALLBACK11-NEXT:    movq -56(%rsp,%rax,4), %rdi
+; FALLBACK11-NEXT:    movq %rdi, %r8
+; FALLBACK11-NEXT:    shrdq %cl, %rsi, %r8
+; FALLBACK11-NEXT:    movq -72(%rsp,%rax,4), %r9
+; FALLBACK11-NEXT:    movq -64(%rsp,%rax,4), %rax
+; FALLBACK11-NEXT:    movq %rax, %r10
+; FALLBACK11-NEXT:    shrdq %cl, %rdi, %r10
+; FALLBACK11-NEXT:    shrdq %cl, %rax, %r9
+; FALLBACK11-NEXT:    shrxq %rcx, %rsi, %rax
+; FALLBACK11-NEXT:    movq %r10, 8(%rdx)
+; FALLBACK11-NEXT:    movq %r8, 16(%rdx)
+; FALLBACK11-NEXT:    movq %rax, 24(%rdx)
+; FALLBACK11-NEXT:    movq %r9, (%rdx)
+; FALLBACK11-NEXT:    vzeroupper
+; FALLBACK11-NEXT:    retq
+;
+; FALLBACK12-LABEL: lshr_32bytes_dwordOff:
+; FALLBACK12:       # %bb.0:
+; FALLBACK12-NEXT:    pushq %rbx
+; FALLBACK12-NEXT:    vmovups (%rdi), %ymm0
+; FALLBACK12-NEXT:    movzbl (%rsi), %ecx
+; FALLBACK12-NEXT:    movl %ecx, %eax
+; FALLBACK12-NEXT:    shlb $5, %al
+; FALLBACK12-NEXT:    vxorps %xmm1, %xmm1, %xmm1
+; FALLBACK12-NEXT:    vmovups %ymm1, -{{[0-9]+}}(%rsp)
+; FALLBACK12-NEXT:    vmovups %ymm0, -{{[0-9]+}}(%rsp)
+; FALLBACK12-NEXT:    andb $6, %cl
+; FALLBACK12-NEXT:    movzbl %cl, %r9d
+; FALLBACK12-NEXT:    movq -64(%rsp,%r9,4), %r10
+; FALLBACK12-NEXT:    movq -56(%rsp,%r9,4), %r8
+; FALLBACK12-NEXT:    movl %eax, %ecx
+; FALLBACK12-NEXT:    shrq %cl, %r10
+; FALLBACK12-NEXT:    movl %eax, %esi
+; FALLBACK12-NEXT:    notb %sil
+; FALLBACK12-NEXT:    leaq (%r8,%r8), %rdi
+; FALLBACK12-NEXT:    movl %esi, %ecx
+; FALLBACK12-NEXT:    shlq %cl, %rdi
+; FALLBACK12-NEXT:    orq %r10, %rdi
+; FALLBACK12-NEXT:    movq -48(%rsp,%r9,4), %r10
+; FALLBACK12-NEXT:    movq %r10, %r11
+; FALLBACK12-NEXT:    movl %eax, %ecx
+; FALLBACK12-NEXT:    shrq %cl, %r11
+; FALLBACK12-NEXT:    movq -40(%rsp,%r9,4), %r9
+; FALLBACK12-NEXT:    leaq (%r9,%r9), %rbx
+; FALLBACK12-NEXT:    movl %esi, %ecx
+; FALLBACK12-NEXT:    shlq %cl, %rbx
+; FALLBACK12-NEXT:    orq %r11, %rbx
+; FALLBACK12-NEXT:    movl %eax, %ecx
+; FALLBACK12-NEXT:    shrq %cl, %r8
+; FALLBACK12-NEXT:    addq %r10, %r10
+; FALLBACK12-NEXT:    movl %esi, %ecx
+; FALLBACK12-NEXT:    shlq %cl, %r10
+; FALLBACK12-NEXT:    orq %r8, %r10
+; FALLBACK12-NEXT:    movl %eax, %ecx
+; FALLBACK12-NEXT:    shrq %cl, %r9
+; FALLBACK12-NEXT:    movq %r9, 24(%rdx)
+; FALLBACK12-NEXT:    movq %r10, 8(%rdx)
+; FALLBACK12-NEXT:    movq %rbx, 16(%rdx)
+; FALLBACK12-NEXT:    movq %rdi, (%rdx)
+; FALLBACK12-NEXT:    popq %rbx
+; FALLBACK12-NEXT:    vzeroupper
+; FALLBACK12-NEXT:    retq
+;
+; FALLBACK13-LABEL: lshr_32bytes_dwordOff:
+; FALLBACK13:       # %bb.0:
+; FALLBACK13-NEXT:    vmovups (%rdi), %ymm0
+; FALLBACK13-NEXT:    movzbl (%rsi), %eax
+; FALLBACK13-NEXT:    movl %eax, %ecx
+; FALLBACK13-NEXT:    shlb $5, %cl
+; FALLBACK13-NEXT:    vxorps %xmm1, %xmm1, %xmm1
+; FALLBACK13-NEXT:    vmovups %ymm1, -{{[0-9]+}}(%rsp)
+; FALLBACK13-NEXT:    vmovups %ymm0, -{{[0-9]+}}(%rsp)
+; FALLBACK13-NEXT:    andb $6, %al
+; FALLBACK13-NEXT:    movzbl %al, %eax
+; FALLBACK13-NEXT:    movq -48(%rsp,%rax,4), %rsi
+; FALLBACK13-NEXT:    movq -56(%rsp,%rax,4), %rdi
+; FALLBACK13-NEXT:    movq %rdi, %r8
+; FALLBACK13-NEXT:    shrdq %cl, %rsi, %r8
+; FALLBACK13-NEXT:    movq -72(%rsp,%rax,4), %r9
+; FALLBACK13-NEXT:    movq -64(%rsp,%rax,4), %rax
+; FALLBACK13-NEXT:    movq %rax, %r10
+; FALLBACK13-NEXT:    shrdq %cl, %rdi, %r10
+; FALLBACK13-NEXT:    shrdq %cl, %rax, %r9
+; FALLBACK13-NEXT:    shrq %cl, %rsi
+; FALLBACK13-NEXT:    movq %r10, 8(%rdx)
+; FALLBACK13-NEXT:    movq %r8, 16(%rdx)
+; FALLBACK13-NEXT:    movq %rsi, 24(%rdx)
+; FALLBACK13-NEXT:    movq %r9, (%rdx)
+; FALLBACK13-NEXT:    vzeroupper
+; FALLBACK13-NEXT:    retq
+;
+; FALLBACK14-LABEL: lshr_32bytes_dwordOff:
+; FALLBACK14:       # %bb.0:
+; FALLBACK14-NEXT:    vmovups (%rdi), %ymm0
+; FALLBACK14-NEXT:    movzbl (%rsi), %ecx
+; FALLBACK14-NEXT:    movl %ecx, %eax
+; FALLBACK14-NEXT:    shlb $5, %al
+; FALLBACK14-NEXT:    vxorps %xmm1, %xmm1, %xmm1
+; FALLBACK14-NEXT:    vmovups %ymm1, -{{[0-9]+}}(%rsp)
+; FALLBACK14-NEXT:    vmovups %ymm0, -{{[0-9]+}}(%rsp)
+; FALLBACK14-NEXT:    andb $6, %cl
+; FALLBACK14-NEXT:    movzbl %cl, %ecx
+; FALLBACK14-NEXT:    shrxq %rax, -72(%rsp,%rcx,4), %rsi
+; FALLBACK14-NEXT:    movq -64(%rsp,%rcx,4), %rdi
+; FALLBACK14-NEXT:    movq -56(%rsp,%rcx,4), %r8
+; FALLBACK14-NEXT:    shrxq %rax, %r8, %r9
+; FALLBACK14-NEXT:    movq -48(%rsp,%rcx,4), %rcx
+; FALLBACK14-NEXT:    shrxq %rax, %rdi, %r10
+; FALLBACK14-NEXT:    shrxq %rax, %rcx, %r11
+; FALLBACK14-NEXT:    # kill: def $al killed $al killed $rax def $rax
+; FALLBACK14-NEXT:    notb %al
+; FALLBACK14-NEXT:    addq %rdi, %rdi
+; FALLBACK14-NEXT:    shlxq %rax, %rdi, %rdi
+; FALLBACK14-NEXT:    orq %rsi, %rdi
+; FALLBACK14-NEXT:    addq %rcx, %rcx
+; FALLBACK14-NEXT:    shlxq %rax, %rcx, %rcx
+; FALLBACK14-NEXT:    orq %r9, %rcx
+; FALLBACK14-NEXT:    addq %r8, %r8
+; FALLBACK14-NEXT:    shlxq %rax, %r8, %rax
+; FALLBACK14-NEXT:    orq %r10, %rax
+; FALLBACK14-NEXT:    movq %r11, 24(%rdx)
+; FALLBACK14-NEXT:    movq %rax, 8(%rdx)
+; FALLBACK14-NEXT:    movq %rcx, 16(%rdx)
+; FALLBACK14-NEXT:    movq %rdi, (%rdx)
+; FALLBACK14-NEXT:    vzeroupper
+; FALLBACK14-NEXT:    retq
+;
+; FALLBACK15-LABEL: lshr_32bytes_dwordOff:
+; FALLBACK15:       # %bb.0:
+; FALLBACK15-NEXT:    vmovups (%rdi), %ymm0
+; FALLBACK15-NEXT:    movzbl (%rsi), %eax
+; FALLBACK15-NEXT:    movl %eax, %ecx
+; FALLBACK15-NEXT:    shlb $5, %cl
+; FALLBACK15-NEXT:    vxorps %xmm1, %xmm1, %xmm1
+; FALLBACK15-NEXT:    vmovups %ymm1, -{{[0-9]+}}(%rsp)
+; FALLBACK15-NEXT:    vmovups %ymm0, -{{[0-9]+}}(%rsp)
+; FALLBACK15-NEXT:    andb $6, %al
+; FALLBACK15-NEXT:    movzbl %al, %eax
+; FALLBACK15-NEXT:    movq -48(%rsp,%rax,4), %rsi
+; FALLBACK15-NEXT:    movq -56(%rsp,%rax,4), %rdi
+; FALLBACK15-NEXT:    movq %rdi, %r8
+; FALLBACK15-NEXT:    shrdq %cl, %rsi, %r8
+; FALLBACK15-NEXT:    movq -72(%rsp,%rax,4), %r9
+; FALLBACK15-NEXT:    movq -64(%rsp,%rax,4), %rax
+; FALLBACK15-NEXT:    movq %rax, %r10
+; FALLBACK15-NEXT:    shrdq %cl, %rdi, %r10
+; FALLBACK15-NEXT:    shrdq %cl, %rax, %r9
+; FALLBACK15-NEXT:    shrxq %rcx, %rsi, %rax
+; FALLBACK15-NEXT:    movq %r10, 8(%rdx)
+; FALLBACK15-NEXT:    movq %r8, 16(%rdx)
+; FALLBACK15-NEXT:    movq %rax, 24(%rdx)
+; FALLBACK15-NEXT:    movq %r9, (%rdx)
+; FALLBACK15-NEXT:    vzeroupper
+; FALLBACK15-NEXT:    retq
+;
+; X86-SSE2-LABEL: lshr_32bytes_dwordOff:
 ; X86-SSE2:       # %bb.0:
 ; X86-SSE2-NEXT:    pushl %ebp
 ; X86-SSE2-NEXT:    pushl %ebx
 ; X86-SSE2-NEXT:    pushl %edi
 ; X86-SSE2-NEXT:    pushl %esi
-; X86-SSE2-NEXT:    subl $72, %esp
+; X86-SSE2-NEXT:    subl $92, %esp
 ; X86-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-SSE2-NEXT:    movl (%eax), %ecx
 ; X86-SSE2-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-SSE2-NEXT:    movl 4(%eax), %ecx
-; X86-SSE2-NEXT:    movl %ecx, (%esp) # 4-byte Spill
+; X86-SSE2-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-SSE2-NEXT:    movl 8(%eax), %esi
 ; X86-SSE2-NEXT:    movl 12(%eax), %edi
 ; X86-SSE2-NEXT:    movl 16(%eax), %ebx
@@ -1148,35 +5664,30 @@ define void @lshr_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ; X86-SSE2-NEXT:    movl 28(%eax), %ecx
 ; X86-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-SSE2-NEXT:    movzbl (%eax), %eax
+; X86-SSE2-NEXT:    xorps %xmm0, %xmm0
+; X86-SSE2-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
 ; X86-SSE2-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
 ; X86-SSE2-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; X86-SSE2-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
 ; X86-SSE2-NEXT:    movl %ebp, {{[0-9]+}}(%esp)
 ; X86-SSE2-NEXT:    movl %ebx, {{[0-9]+}}(%esp)
 ; X86-SSE2-NEXT:    movl %edi, {{[0-9]+}}(%esp)
 ; X86-SSE2-NEXT:    movl %esi, {{[0-9]+}}(%esp)
-; X86-SSE2-NEXT:    movl (%esp), %ecx # 4-byte Reload
+; X86-SSE2-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
 ; X86-SSE2-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
 ; X86-SSE2-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
 ; X86-SSE2-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
-; X86-SSE2-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-SSE2-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-SSE2-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-SSE2-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-SSE2-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-SSE2-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-SSE2-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-SSE2-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-SSE2-NEXT:    andl $31, %eax
-; X86-SSE2-NEXT:    movl 8(%esp,%eax), %ecx
+; X86-SSE2-NEXT:    andl $7, %eax
+; X86-SSE2-NEXT:    movl 16(%esp,%eax,4), %ecx
+; X86-SSE2-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-SSE2-NEXT:    movl 20(%esp,%eax,4), %ecx
 ; X86-SSE2-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-SSE2-NEXT:    movl 12(%esp,%eax), %ecx
-; X86-SSE2-NEXT:    movl %ecx, (%esp) # 4-byte Spill
-; X86-SSE2-NEXT:    movl 20(%esp,%eax), %esi
-; X86-SSE2-NEXT:    movl 16(%esp,%eax), %edi
-; X86-SSE2-NEXT:    movl 28(%esp,%eax), %ebx
-; X86-SSE2-NEXT:    movl 24(%esp,%eax), %ebp
-; X86-SSE2-NEXT:    movl 36(%esp,%eax), %edx
-; X86-SSE2-NEXT:    movl 32(%esp,%eax), %ecx
+; X86-SSE2-NEXT:    movl 28(%esp,%eax,4), %esi
+; X86-SSE2-NEXT:    movl 24(%esp,%eax,4), %edi
+; X86-SSE2-NEXT:    movl 36(%esp,%eax,4), %ebx
+; X86-SSE2-NEXT:    movl 32(%esp,%eax,4), %ebp
+; X86-SSE2-NEXT:    movl 44(%esp,%eax,4), %edx
+; X86-SSE2-NEXT:    movl 40(%esp,%eax,4), %ecx
 ; X86-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-SSE2-NEXT:    movl %ecx, 24(%eax)
 ; X86-SSE2-NEXT:    movl %edx, 28(%eax)
@@ -1186,18 +5697,18 @@ define void @lshr_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ; X86-SSE2-NEXT:    movl %esi, 12(%eax)
 ; X86-SSE2-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
 ; X86-SSE2-NEXT:    movl %ecx, (%eax)
-; X86-SSE2-NEXT:    movl (%esp), %ecx # 4-byte Reload
+; X86-SSE2-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
 ; X86-SSE2-NEXT:    movl %ecx, 4(%eax)
-; X86-SSE2-NEXT:    addl $72, %esp
+; X86-SSE2-NEXT:    addl $92, %esp
 ; X86-SSE2-NEXT:    popl %esi
 ; X86-SSE2-NEXT:    popl %edi
 ; X86-SSE2-NEXT:    popl %ebx
 ; X86-SSE2-NEXT:    popl %ebp
 ; X86-SSE2-NEXT:    retl
 ;
-; X86-SSE42-LABEL: lshr_32bytes:
+; X86-SSE42-LABEL: lshr_32bytes_dwordOff:
 ; X86-SSE42:       # %bb.0:
-; X86-SSE42-NEXT:    subl $64, %esp
+; X86-SSE42-NEXT:    subl $76, %esp
 ; X86-SSE42-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-SSE42-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-SSE42-NEXT:    movl {{[0-9]+}}(%esp), %edx
@@ -1205,21 +5716,21 @@ define void @lshr_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ; X86-SSE42-NEXT:    movups 16(%edx), %xmm1
 ; X86-SSE42-NEXT:    movzbl (%ecx), %ecx
 ; X86-SSE42-NEXT:    xorps %xmm2, %xmm2
-; X86-SSE42-NEXT:    movups %xmm2, {{[0-9]+}}(%esp)
-; X86-SSE42-NEXT:    movups %xmm2, {{[0-9]+}}(%esp)
-; X86-SSE42-NEXT:    movups %xmm1, {{[0-9]+}}(%esp)
-; X86-SSE42-NEXT:    movups %xmm0, (%esp)
-; X86-SSE42-NEXT:    andl $31, %ecx
-; X86-SSE42-NEXT:    movups (%esp,%ecx), %xmm0
-; X86-SSE42-NEXT:    movups 16(%esp,%ecx), %xmm1
+; X86-SSE42-NEXT:    movaps %xmm2, {{[0-9]+}}(%esp)
+; X86-SSE42-NEXT:    movaps %xmm2, {{[0-9]+}}(%esp)
+; X86-SSE42-NEXT:    movaps %xmm1, {{[0-9]+}}(%esp)
+; X86-SSE42-NEXT:    movaps %xmm0, (%esp)
+; X86-SSE42-NEXT:    andl $7, %ecx
+; X86-SSE42-NEXT:    movups (%esp,%ecx,4), %xmm0
+; X86-SSE42-NEXT:    movups 16(%esp,%ecx,4), %xmm1
 ; X86-SSE42-NEXT:    movups %xmm1, 16(%eax)
 ; X86-SSE42-NEXT:    movups %xmm0, (%eax)
-; X86-SSE42-NEXT:    addl $64, %esp
+; X86-SSE42-NEXT:    addl $76, %esp
 ; X86-SSE42-NEXT:    retl
 ;
-; X86-AVX-LABEL: lshr_32bytes:
+; X86-AVX-LABEL: lshr_32bytes_dwordOff:
 ; X86-AVX:       # %bb.0:
-; X86-AVX-NEXT:    subl $64, %esp
+; X86-AVX-NEXT:    subl $76, %esp
 ; X86-AVX-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-AVX-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-AVX-NEXT:    movl {{[0-9]+}}(%esp), %edx
@@ -1228,137 +5739,2812 @@ define void @lshr_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ; X86-AVX-NEXT:    vxorps %xmm1, %xmm1, %xmm1
 ; X86-AVX-NEXT:    vmovups %ymm1, {{[0-9]+}}(%esp)
 ; X86-AVX-NEXT:    vmovups %ymm0, (%esp)
-; X86-AVX-NEXT:    andl $31, %ecx
-; X86-AVX-NEXT:    vmovups (%esp,%ecx), %xmm0
-; X86-AVX-NEXT:    vmovups 16(%esp,%ecx), %xmm1
+; X86-AVX-NEXT:    andl $7, %ecx
+; X86-AVX-NEXT:    vmovups (%esp,%ecx,4), %xmm0
+; X86-AVX-NEXT:    vmovups 16(%esp,%ecx,4), %xmm1
 ; X86-AVX-NEXT:    vmovups %xmm1, 16(%eax)
 ; X86-AVX-NEXT:    vmovups %xmm0, (%eax)
-; X86-AVX-NEXT:    addl $64, %esp
+; X86-AVX-NEXT:    addl $76, %esp
 ; X86-AVX-NEXT:    vzeroupper
 ; X86-AVX-NEXT:    retl
   %src = load i256, ptr %src.ptr, align 1
-  %byteOff = load i256, ptr %byteOff.ptr, align 1
-  %bitOff = shl i256 %byteOff, 3
+  %dwordOff = load i256, ptr %dwordOff.ptr, align 1
+  %bitOff = shl i256 %dwordOff, 5
   %res = lshr i256 %src, %bitOff
   store i256 %res, ptr %dst, align 1
   ret void
 }
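
(Aside, not part of the diff: in the dwordOff test just above, the shift amount is a known multiple of 32 bits, so the 32-bit targets can satisfy the whole shift with dword-granular loads alone — `andl $7` plus the `(%esp,%eax,4)` addressing, with no shift instructions — while the 64-bit fallbacks still convert the dword offset to a bit offset (`shlb $5`) and use shrd/shr across 64-bit words, since a 32-bit granule does not always land on a 64-bit boundary. A minimal sketch of the same pattern with a constant offset; the function name is hypothetical and this test is not in the commit:

  define void @lshr_32bytes_dword3(ptr %src.ptr, ptr %dst) nounwind {
    %src = load i256, ptr %src.ptr, align 1
    %res = lshr i256 %src, 96                ; 3 dwords * 32 bits
    store i256 %res, ptr %dst, align 1
    ret void
  }
)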
-define void @shl_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
-; X64-SSE2-LABEL: shl_32bytes:
+
+define void @lshr_32bytes_qwordOff(ptr %src.ptr, ptr %qwordOff.ptr, ptr %dst) nounwind {
+; X64-SSE2-LABEL: lshr_32bytes_qwordOff:
 ; X64-SSE2:       # %bb.0:
 ; X64-SSE2-NEXT:    movq (%rdi), %rax
 ; X64-SSE2-NEXT:    movq 8(%rdi), %rcx
 ; X64-SSE2-NEXT:    movq 16(%rdi), %r8
 ; X64-SSE2-NEXT:    movq 24(%rdi), %rdi
 ; X64-SSE2-NEXT:    movzbl (%rsi), %esi
+; X64-SSE2-NEXT:    xorps %xmm0, %xmm0
+; X64-SSE2-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
+; X64-SSE2-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
 ; X64-SSE2-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
 ; X64-SSE2-NEXT:    movq %r8, -{{[0-9]+}}(%rsp)
 ; X64-SSE2-NEXT:    movq %rcx, -{{[0-9]+}}(%rsp)
 ; X64-SSE2-NEXT:    movq %rax, -{{[0-9]+}}(%rsp)
-; X64-SSE2-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
-; X64-SSE2-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
-; X64-SSE2-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
-; X64-SSE2-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
-; X64-SSE2-NEXT:    andb $31, %sil
-; X64-SSE2-NEXT:    negb %sil
-; X64-SSE2-NEXT:    movsbq %sil, %rax
-; X64-SSE2-NEXT:    movq -32(%rsp,%rax), %rcx
-; X64-SSE2-NEXT:    movq -24(%rsp,%rax), %rsi
-; X64-SSE2-NEXT:    movq -8(%rsp,%rax), %rdi
-; X64-SSE2-NEXT:    movq -16(%rsp,%rax), %rax
-; X64-SSE2-NEXT:    movq %rax, 16(%rdx)
+; X64-SSE2-NEXT:    andl $3, %esi
+; X64-SSE2-NEXT:    movq -72(%rsp,%rsi,8), %rax
+; X64-SSE2-NEXT:    movq -64(%rsp,%rsi,8), %rcx
+; X64-SSE2-NEXT:    movq -48(%rsp,%rsi,8), %rdi
+; X64-SSE2-NEXT:    movq -56(%rsp,%rsi,8), %rsi
+; X64-SSE2-NEXT:    movq %rsi, 16(%rdx)
 ; X64-SSE2-NEXT:    movq %rdi, 24(%rdx)
-; X64-SSE2-NEXT:    movq %rcx, (%rdx)
-; X64-SSE2-NEXT:    movq %rsi, 8(%rdx)
+; X64-SSE2-NEXT:    movq %rax, (%rdx)
+; X64-SSE2-NEXT:    movq %rcx, 8(%rdx)
 ; X64-SSE2-NEXT:    retq
 ;
-; X64-SSE42-LABEL: shl_32bytes:
+; X64-SSE42-LABEL: lshr_32bytes_qwordOff:
 ; X64-SSE42:       # %bb.0:
 ; X64-SSE42-NEXT:    movups (%rdi), %xmm0
 ; X64-SSE42-NEXT:    movups 16(%rdi), %xmm1
 ; X64-SSE42-NEXT:    movzbl (%rsi), %eax
 ; X64-SSE42-NEXT:    xorps %xmm2, %xmm2
-; X64-SSE42-NEXT:    movups %xmm2, -{{[0-9]+}}(%rsp)
-; X64-SSE42-NEXT:    movups %xmm2, -{{[0-9]+}}(%rsp)
-; X64-SSE42-NEXT:    movups %xmm1, -{{[0-9]+}}(%rsp)
-; X64-SSE42-NEXT:    movups %xmm0, -{{[0-9]+}}(%rsp)
-; X64-SSE42-NEXT:    andb $31, %al
-; X64-SSE42-NEXT:    negb %al
-; X64-SSE42-NEXT:    movsbq %al, %rax
-; X64-SSE42-NEXT:    movups -32(%rsp,%rax), %xmm0
-; X64-SSE42-NEXT:    movups -16(%rsp,%rax), %xmm1
+; X64-SSE42-NEXT:    movaps %xmm2, -{{[0-9]+}}(%rsp)
+; X64-SSE42-NEXT:    movaps %xmm2, -{{[0-9]+}}(%rsp)
+; X64-SSE42-NEXT:    movaps %xmm1, -{{[0-9]+}}(%rsp)
+; X64-SSE42-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
+; X64-SSE42-NEXT:    andl $3, %eax
+; X64-SSE42-NEXT:    movups -72(%rsp,%rax,8), %xmm0
+; X64-SSE42-NEXT:    movups -56(%rsp,%rax,8), %xmm1
 ; X64-SSE42-NEXT:    movups %xmm1, 16(%rdx)
 ; X64-SSE42-NEXT:    movups %xmm0, (%rdx)
 ; X64-SSE42-NEXT:    retq
 ;
-; X64-AVX-LABEL: shl_32bytes:
+; X64-AVX-LABEL: lshr_32bytes_qwordOff:
 ; X64-AVX:       # %bb.0:
 ; X64-AVX-NEXT:    vmovups (%rdi), %ymm0
 ; X64-AVX-NEXT:    movzbl (%rsi), %eax
 ; X64-AVX-NEXT:    vxorps %xmm1, %xmm1, %xmm1
 ; X64-AVX-NEXT:    vmovups %ymm1, -{{[0-9]+}}(%rsp)
 ; X64-AVX-NEXT:    vmovups %ymm0, -{{[0-9]+}}(%rsp)
-; X64-AVX-NEXT:    andb $31, %al
-; X64-AVX-NEXT:    negb %al
-; X64-AVX-NEXT:    movsbq %al, %rax
-; X64-AVX-NEXT:    vmovups -32(%rsp,%rax), %xmm0
-; X64-AVX-NEXT:    vmovups -16(%rsp,%rax), %xmm1
+; X64-AVX-NEXT:    andl $3, %eax
+; X64-AVX-NEXT:    vmovups -72(%rsp,%rax,8), %xmm0
+; X64-AVX-NEXT:    vmovups -56(%rsp,%rax,8), %xmm1
 ; X64-AVX-NEXT:    vmovups %xmm1, 16(%rdx)
 ; X64-AVX-NEXT:    vmovups %xmm0, (%rdx)
 ; X64-AVX-NEXT:    vzeroupper
 ; X64-AVX-NEXT:    retq
 ;
-; X86-SSE2-LABEL: shl_32bytes:
+; X86-SSE2-LABEL: lshr_32bytes_qwordOff:
 ; X86-SSE2:       # %bb.0:
 ; X86-SSE2-NEXT:    pushl %ebp
 ; X86-SSE2-NEXT:    pushl %ebx
 ; X86-SSE2-NEXT:    pushl %edi
 ; X86-SSE2-NEXT:    pushl %esi
-; X86-SSE2-NEXT:    subl $72, %esp
+; X86-SSE2-NEXT:    subl $92, %esp
 ; X86-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %edi
-; X86-SSE2-NEXT:    movl (%edi), %ecx
+; X86-SSE2-NEXT:    movl (%eax), %ecx
+; X86-SSE2-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-SSE2-NEXT:    movl 4(%eax), %ecx
 ; X86-SSE2-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-SSE2-NEXT:    movl 4(%edi), %ecx
-; X86-SSE2-NEXT:    movl %ecx, (%esp) # 4-byte Spill
-; X86-SSE2-NEXT:    movl 8(%edi), %esi
-; X86-SSE2-NEXT:    movl 12(%edi), %ebx
-; X86-SSE2-NEXT:    movl 16(%edi), %ebp
+; X86-SSE2-NEXT:    movl 8(%eax), %esi
+; X86-SSE2-NEXT:    movl 12(%eax), %edi
+; X86-SSE2-NEXT:    movl 16(%eax), %ebx
+; X86-SSE2-NEXT:    movl 20(%eax), %ebp
+; X86-SSE2-NEXT:    movl 24(%eax), %edx
+; X86-SSE2-NEXT:    movl 28(%eax), %ecx
+; X86-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-SSE2-NEXT:    movzbl (%eax), %eax
-; X86-SSE2-NEXT:    movl 20(%edi), %edx
-; X86-SSE2-NEXT:    movl 24(%edi), %ecx
-; X86-SSE2-NEXT:    movl 28(%edi), %edi
-; X86-SSE2-NEXT:    movl %edi, {{[0-9]+}}(%esp)
+; X86-SSE2-NEXT:    xorps %xmm0, %xmm0
+; X86-SSE2-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
 ; X86-SSE2-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
 ; X86-SSE2-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; X86-SSE2-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
 ; X86-SSE2-NEXT:    movl %ebp, {{[0-9]+}}(%esp)
 ; X86-SSE2-NEXT:    movl %ebx, {{[0-9]+}}(%esp)
+; X86-SSE2-NEXT:    movl %edi, {{[0-9]+}}(%esp)
 ; X86-SSE2-NEXT:    movl %esi, {{[0-9]+}}(%esp)
-; X86-SSE2-NEXT:    movl (%esp), %ecx # 4-byte Reload
+; X86-SSE2-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
 ; X86-SSE2-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
 ; X86-SSE2-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
 ; X86-SSE2-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
-; X86-SSE2-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-SSE2-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-SSE2-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-SSE2-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-SSE2-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-SSE2-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-SSE2-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-SSE2-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-SSE2-NEXT:    andb $31, %al
-; X86-SSE2-NEXT:    negb %al
-; X86-SSE2-NEXT:    movsbl %al, %edx
-; X86-SSE2-NEXT:    movl 40(%esp,%edx), %eax
+; X86-SSE2-NEXT:    andl $3, %eax
+; X86-SSE2-NEXT:    movl 16(%esp,%eax,8), %ecx
+; X86-SSE2-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-SSE2-NEXT:    movl 20(%esp,%eax,8), %ecx
+; X86-SSE2-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-SSE2-NEXT:    movl 28(%esp,%eax,8), %esi
+; X86-SSE2-NEXT:    movl 24(%esp,%eax,8), %edi
+; X86-SSE2-NEXT:    movl 36(%esp,%eax,8), %ebx
+; X86-SSE2-NEXT:    movl 32(%esp,%eax,8), %ebp
+; X86-SSE2-NEXT:    movl 44(%esp,%eax,8), %edx
+; X86-SSE2-NEXT:    movl 40(%esp,%eax,8), %ecx
+; X86-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-SSE2-NEXT:    movl %ecx, 24(%eax)
+; X86-SSE2-NEXT:    movl %edx, 28(%eax)
+; X86-SSE2-NEXT:    movl %ebp, 16(%eax)
+; X86-SSE2-NEXT:    movl %ebx, 20(%eax)
+; X86-SSE2-NEXT:    movl %edi, 8(%eax)
+; X86-SSE2-NEXT:    movl %esi, 12(%eax)
+; X86-SSE2-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-SSE2-NEXT:    movl %ecx, (%eax)
+; X86-SSE2-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-SSE2-NEXT:    movl %ecx, 4(%eax)
+; X86-SSE2-NEXT:    addl $92, %esp
+; X86-SSE2-NEXT:    popl %esi
+; X86-SSE2-NEXT:    popl %edi
+; X86-SSE2-NEXT:    popl %ebx
+; X86-SSE2-NEXT:    popl %ebp
+; X86-SSE2-NEXT:    retl
+;
+; X86-SSE42-LABEL: lshr_32bytes_qwordOff:
+; X86-SSE42:       # %bb.0:
+; X86-SSE42-NEXT:    subl $76, %esp
+; X86-SSE42-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-SSE42-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-SSE42-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-SSE42-NEXT:    movups (%edx), %xmm0
+; X86-SSE42-NEXT:    movups 16(%edx), %xmm1
+; X86-SSE42-NEXT:    movzbl (%ecx), %ecx
+; X86-SSE42-NEXT:    xorps %xmm2, %xmm2
+; X86-SSE42-NEXT:    movaps %xmm2, {{[0-9]+}}(%esp)
+; X86-SSE42-NEXT:    movaps %xmm2, {{[0-9]+}}(%esp)
+; X86-SSE42-NEXT:    movaps %xmm1, {{[0-9]+}}(%esp)
+; X86-SSE42-NEXT:    movaps %xmm0, (%esp)
+; X86-SSE42-NEXT:    andl $3, %ecx
+; X86-SSE42-NEXT:    movups (%esp,%ecx,8), %xmm0
+; X86-SSE42-NEXT:    movups 16(%esp,%ecx,8), %xmm1
+; X86-SSE42-NEXT:    movups %xmm1, 16(%eax)
+; X86-SSE42-NEXT:    movups %xmm0, (%eax)
+; X86-SSE42-NEXT:    addl $76, %esp
+; X86-SSE42-NEXT:    retl
+;
+; X86-AVX-LABEL: lshr_32bytes_qwordOff:
+; X86-AVX:       # %bb.0:
+; X86-AVX-NEXT:    subl $76, %esp
+; X86-AVX-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-AVX-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-AVX-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-AVX-NEXT:    vmovups (%edx), %ymm0
+; X86-AVX-NEXT:    movzbl (%ecx), %ecx
+; X86-AVX-NEXT:    vxorps %xmm1, %xmm1, %xmm1
+; X86-AVX-NEXT:    vmovups %ymm1, {{[0-9]+}}(%esp)
+; X86-AVX-NEXT:    vmovups %ymm0, (%esp)
+; X86-AVX-NEXT:    andl $3, %ecx
+; X86-AVX-NEXT:    vmovups (%esp,%ecx,8), %xmm0
+; X86-AVX-NEXT:    vmovups 16(%esp,%ecx,8), %xmm1
+; X86-AVX-NEXT:    vmovups %xmm1, 16(%eax)
+; X86-AVX-NEXT:    vmovups %xmm0, (%eax)
+; X86-AVX-NEXT:    addl $76, %esp
+; X86-AVX-NEXT:    vzeroupper
+; X86-AVX-NEXT:    retl
+  %src = load i256, ptr %src.ptr, align 1
+  %qwordOff = load i256, ptr %qwordOff.ptr, align 1
+  %bitOff = shl i256 %qwordOff, 6
+  %res = lshr i256 %src, %bitOff
+  store i256 %res, ptr %dst, align 1
+  ret void
+}
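
(Aside, not part of the diff: with a 64-bit shift unit, a qword offset is always unit-aligned, so the X64 expansions above reduce to pure loads and stores — the offset is masked with `andl $3` and used directly in qword-scaled addressing such as `-72(%rsp,%rsi,8)`, with no shift instructions emitted. A minimal sketch that exercises the same path with a constant offset; the function name is hypothetical and this test is not in the commit:

  define void @lshr_32bytes_qword2(ptr %src.ptr, ptr %dst) nounwind {
    %src = load i256, ptr %src.ptr, align 1
    %res = lshr i256 %src, 128               ; 2 qwords * 64 bits
    store i256 %res, ptr %dst, align 1
    ret void
  }
)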
+
+define void @shl_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
+; FALLBACK0-LABEL: shl_32bytes:
+; FALLBACK0:       # %bb.0:
+; FALLBACK0-NEXT:    pushq %rbx
+; FALLBACK0-NEXT:    movq (%rdi), %rcx
+; FALLBACK0-NEXT:    movq 8(%rdi), %r8
+; FALLBACK0-NEXT:    movq 16(%rdi), %r9
+; FALLBACK0-NEXT:    movq 24(%rdi), %rdi
+; FALLBACK0-NEXT:    movzbl (%rsi), %esi
+; FALLBACK0-NEXT:    leal (,%rsi,8), %eax
+; FALLBACK0-NEXT:    xorps %xmm0, %xmm0
+; FALLBACK0-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
+; FALLBACK0-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
+; FALLBACK0-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK0-NEXT:    movq %r9, -{{[0-9]+}}(%rsp)
+; FALLBACK0-NEXT:    movq %r8, -{{[0-9]+}}(%rsp)
+; FALLBACK0-NEXT:    movq %rcx, -{{[0-9]+}}(%rsp)
+; FALLBACK0-NEXT:    andb $24, %sil
+; FALLBACK0-NEXT:    negb %sil
+; FALLBACK0-NEXT:    movsbq %sil, %r10
+; FALLBACK0-NEXT:    movq -32(%rsp,%r10), %r8
+; FALLBACK0-NEXT:    movq -24(%rsp,%r10), %rdi
+; FALLBACK0-NEXT:    movq %rdi, %r11
+; FALLBACK0-NEXT:    movl %eax, %ecx
+; FALLBACK0-NEXT:    shlq %cl, %r11
+; FALLBACK0-NEXT:    movl %eax, %esi
+; FALLBACK0-NEXT:    notb %sil
+; FALLBACK0-NEXT:    movq %r8, %r9
+; FALLBACK0-NEXT:    shrq %r9
+; FALLBACK0-NEXT:    movl %esi, %ecx
+; FALLBACK0-NEXT:    shrq %cl, %r9
+; FALLBACK0-NEXT:    orq %r11, %r9
+; FALLBACK0-NEXT:    movq -8(%rsp,%r10), %r11
+; FALLBACK0-NEXT:    movl %eax, %ecx
+; FALLBACK0-NEXT:    shlq %cl, %r11
+; FALLBACK0-NEXT:    movq -16(%rsp,%r10), %r10
+; FALLBACK0-NEXT:    movq %r10, %rbx
+; FALLBACK0-NEXT:    shrq %rbx
+; FALLBACK0-NEXT:    movl %esi, %ecx
+; FALLBACK0-NEXT:    shrq %cl, %rbx
+; FALLBACK0-NEXT:    orq %r11, %rbx
+; FALLBACK0-NEXT:    movl %eax, %ecx
+; FALLBACK0-NEXT:    shlq %cl, %r10
+; FALLBACK0-NEXT:    shrq %rdi
+; FALLBACK0-NEXT:    movl %esi, %ecx
+; FALLBACK0-NEXT:    shrq %cl, %rdi
+; FALLBACK0-NEXT:    orq %r10, %rdi
+; FALLBACK0-NEXT:    movl %eax, %ecx
+; FALLBACK0-NEXT:    shlq %cl, %r8
+; FALLBACK0-NEXT:    movq %r8, (%rdx)
+; FALLBACK0-NEXT:    movq %rdi, 16(%rdx)
+; FALLBACK0-NEXT:    movq %rbx, 24(%rdx)
+; FALLBACK0-NEXT:    movq %r9, 8(%rdx)
+; FALLBACK0-NEXT:    popq %rbx
+; FALLBACK0-NEXT:    retq
+;
+; FALLBACK1-LABEL: shl_32bytes:
+; FALLBACK1:       # %bb.0:
+; FALLBACK1-NEXT:    movq (%rdi), %rax
+; FALLBACK1-NEXT:    movq 8(%rdi), %r8
+; FALLBACK1-NEXT:    movq 16(%rdi), %r9
+; FALLBACK1-NEXT:    movq 24(%rdi), %rdi
+; FALLBACK1-NEXT:    movzbl (%rsi), %esi
+; FALLBACK1-NEXT:    leal (,%rsi,8), %ecx
+; FALLBACK1-NEXT:    xorps %xmm0, %xmm0
+; FALLBACK1-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
+; FALLBACK1-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
+; FALLBACK1-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK1-NEXT:    movq %r9, -{{[0-9]+}}(%rsp)
+; FALLBACK1-NEXT:    movq %r8, -{{[0-9]+}}(%rsp)
+; FALLBACK1-NEXT:    movq %rax, -{{[0-9]+}}(%rsp)
+; FALLBACK1-NEXT:    andb $24, %sil
+; FALLBACK1-NEXT:    negb %sil
+; FALLBACK1-NEXT:    movsbq %sil, %rax
+; FALLBACK1-NEXT:    movq -24(%rsp,%rax), %rsi
+; FALLBACK1-NEXT:    movq -16(%rsp,%rax), %rdi
+; FALLBACK1-NEXT:    shldq %cl, %rsi, %rdi
+; FALLBACK1-NEXT:    movq -40(%rsp,%rax), %r8
+; FALLBACK1-NEXT:    movq -32(%rsp,%rax), %rax
+; FALLBACK1-NEXT:    shldq %cl, %rax, %rsi
+; FALLBACK1-NEXT:    shldq %cl, %r8, %rax
+; FALLBACK1-NEXT:    # kill: def $cl killed $cl killed $ecx
+; FALLBACK1-NEXT:    shlq %cl, %r8
+; FALLBACK1-NEXT:    movq %rsi, 16(%rdx)
+; FALLBACK1-NEXT:    movq %rdi, 24(%rdx)
+; FALLBACK1-NEXT:    movq %r8, (%rdx)
+; FALLBACK1-NEXT:    movq %rax, 8(%rdx)
+; FALLBACK1-NEXT:    retq
+;
+; FALLBACK2-LABEL: shl_32bytes:
+; FALLBACK2:       # %bb.0:
+; FALLBACK2-NEXT:    movq (%rdi), %rcx
+; FALLBACK2-NEXT:    movq 8(%rdi), %r8
+; FALLBACK2-NEXT:    movq 16(%rdi), %r9
+; FALLBACK2-NEXT:    movq 24(%rdi), %rdi
+; FALLBACK2-NEXT:    movzbl (%rsi), %esi
+; FALLBACK2-NEXT:    leal (,%rsi,8), %eax
+; FALLBACK2-NEXT:    xorps %xmm0, %xmm0
+; FALLBACK2-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
+; FALLBACK2-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
+; FALLBACK2-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK2-NEXT:    movq %r9, -{{[0-9]+}}(%rsp)
+; FALLBACK2-NEXT:    movq %r8, -{{[0-9]+}}(%rsp)
+; FALLBACK2-NEXT:    movq %rcx, -{{[0-9]+}}(%rsp)
+; FALLBACK2-NEXT:    andb $24, %sil
+; FALLBACK2-NEXT:    negb %sil
+; FALLBACK2-NEXT:    movsbq %sil, %rsi
+; FALLBACK2-NEXT:    movq -40(%rsp,%rsi), %rdi
+; FALLBACK2-NEXT:    movq -32(%rsp,%rsi), %rcx
+; FALLBACK2-NEXT:    shlxq %rax, %rcx, %r8
+; FALLBACK2-NEXT:    shlxq %rax, -16(%rsp,%rsi), %r9
+; FALLBACK2-NEXT:    movq -24(%rsp,%rsi), %rsi
+; FALLBACK2-NEXT:    shlxq %rax, %rsi, %r10
+; FALLBACK2-NEXT:    shlxq %rax, %rdi, %r11
+; FALLBACK2-NEXT:    # kill: def $al killed $al killed $rax def $rax
+; FALLBACK2-NEXT:    notb %al
+; FALLBACK2-NEXT:    shrq %rdi
+; FALLBACK2-NEXT:    shrxq %rax, %rdi, %rdi
+; FALLBACK2-NEXT:    orq %r8, %rdi
+; FALLBACK2-NEXT:    shrq %rsi
+; FALLBACK2-NEXT:    shrxq %rax, %rsi, %rsi
+; FALLBACK2-NEXT:    orq %r9, %rsi
+; FALLBACK2-NEXT:    shrq %rcx
+; FALLBACK2-NEXT:    shrxq %rax, %rcx, %rax
+; FALLBACK2-NEXT:    orq %r10, %rax
+; FALLBACK2-NEXT:    movq %r11, (%rdx)
+; FALLBACK2-NEXT:    movq %rax, 16(%rdx)
+; FALLBACK2-NEXT:    movq %rsi, 24(%rdx)
+; FALLBACK2-NEXT:    movq %rdi, 8(%rdx)
+; FALLBACK2-NEXT:    retq
+;
+; FALLBACK3-LABEL: shl_32bytes:
+; FALLBACK3:       # %bb.0:
+; FALLBACK3-NEXT:    movq (%rdi), %rax
+; FALLBACK3-NEXT:    movq 8(%rdi), %r8
+; FALLBACK3-NEXT:    movq 16(%rdi), %r9
+; FALLBACK3-NEXT:    movq 24(%rdi), %rdi
+; FALLBACK3-NEXT:    movzbl (%rsi), %esi
+; FALLBACK3-NEXT:    leal (,%rsi,8), %ecx
+; FALLBACK3-NEXT:    xorps %xmm0, %xmm0
+; FALLBACK3-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
+; FALLBACK3-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
+; FALLBACK3-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK3-NEXT:    movq %r9, -{{[0-9]+}}(%rsp)
+; FALLBACK3-NEXT:    movq %r8, -{{[0-9]+}}(%rsp)
+; FALLBACK3-NEXT:    movq %rax, -{{[0-9]+}}(%rsp)
+; FALLBACK3-NEXT:    andb $24, %sil
+; FALLBACK3-NEXT:    negb %sil
+; FALLBACK3-NEXT:    movsbq %sil, %rax
+; FALLBACK3-NEXT:    movq -24(%rsp,%rax), %rsi
+; FALLBACK3-NEXT:    movq -16(%rsp,%rax), %rdi
+; FALLBACK3-NEXT:    shldq %cl, %rsi, %rdi
+; FALLBACK3-NEXT:    movq -40(%rsp,%rax), %r8
+; FALLBACK3-NEXT:    movq -32(%rsp,%rax), %rax
+; FALLBACK3-NEXT:    shldq %cl, %rax, %rsi
+; FALLBACK3-NEXT:    shldq %cl, %r8, %rax
+; FALLBACK3-NEXT:    shlxq %rcx, %r8, %rcx
+; FALLBACK3-NEXT:    movq %rsi, 16(%rdx)
+; FALLBACK3-NEXT:    movq %rdi, 24(%rdx)
+; FALLBACK3-NEXT:    movq %rcx, (%rdx)
+; FALLBACK3-NEXT:    movq %rax, 8(%rdx)
+; FALLBACK3-NEXT:    retq
+;
+; FALLBACK4-LABEL: shl_32bytes:
+; FALLBACK4:       # %bb.0:
+; FALLBACK4-NEXT:    movups (%rdi), %xmm0
+; FALLBACK4-NEXT:    movups 16(%rdi), %xmm1
+; FALLBACK4-NEXT:    movzbl (%rsi), %ecx
+; FALLBACK4-NEXT:    leal (,%rcx,8), %eax
+; FALLBACK4-NEXT:    xorps %xmm2, %xmm2
+; FALLBACK4-NEXT:    movaps %xmm2, -{{[0-9]+}}(%rsp)
+; FALLBACK4-NEXT:    movaps %xmm2, -{{[0-9]+}}(%rsp)
+; FALLBACK4-NEXT:    movaps %xmm1, -{{[0-9]+}}(%rsp)
+; FALLBACK4-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
+; FALLBACK4-NEXT:    andb $24, %cl
+; FALLBACK4-NEXT:    negb %cl
+; FALLBACK4-NEXT:    movsbq %cl, %r8
+; FALLBACK4-NEXT:    movq -16(%rsp,%r8), %r9
+; FALLBACK4-NEXT:    movl %eax, %ecx
+; FALLBACK4-NEXT:    shlq %cl, %r9
+; FALLBACK4-NEXT:    movl %eax, %esi
+; FALLBACK4-NEXT:    notb %sil
+; FALLBACK4-NEXT:    movq -24(%rsp,%r8), %r10
+; FALLBACK4-NEXT:    movq %r10, %rdi
+; FALLBACK4-NEXT:    shrq %rdi
+; FALLBACK4-NEXT:    movl %esi, %ecx
+; FALLBACK4-NEXT:    shrq %cl, %rdi
+; FALLBACK4-NEXT:    orq %r9, %rdi
+; FALLBACK4-NEXT:    movl %eax, %ecx
+; FALLBACK4-NEXT:    shlq %cl, %r10
+; FALLBACK4-NEXT:    movq -40(%rsp,%r8), %r9
+; FALLBACK4-NEXT:    movq -32(%rsp,%r8), %r8
+; FALLBACK4-NEXT:    movq %r8, %r11
+; FALLBACK4-NEXT:    shrq %r11
+; FALLBACK4-NEXT:    movl %esi, %ecx
+; FALLBACK4-NEXT:    shrq %cl, %r11
+; FALLBACK4-NEXT:    orq %r10, %r11
+; FALLBACK4-NEXT:    movl %eax, %ecx
+; FALLBACK4-NEXT:    shlq %cl, %r8
+; FALLBACK4-NEXT:    movq %r9, %r10
+; FALLBACK4-NEXT:    shrq %r10
+; FALLBACK4-NEXT:    movl %esi, %ecx
+; FALLBACK4-NEXT:    shrq %cl, %r10
+; FALLBACK4-NEXT:    orq %r8, %r10
+; FALLBACK4-NEXT:    movl %eax, %ecx
+; FALLBACK4-NEXT:    shlq %cl, %r9
+; FALLBACK4-NEXT:    movq %r9, (%rdx)
+; FALLBACK4-NEXT:    movq %r10, 8(%rdx)
+; FALLBACK4-NEXT:    movq %r11, 16(%rdx)
+; FALLBACK4-NEXT:    movq %rdi, 24(%rdx)
+; FALLBACK4-NEXT:    retq
+;
+; FALLBACK5-LABEL: shl_32bytes:
+; FALLBACK5:       # %bb.0:
+; FALLBACK5-NEXT:    movups (%rdi), %xmm0
+; FALLBACK5-NEXT:    movups 16(%rdi), %xmm1
+; FALLBACK5-NEXT:    movzbl (%rsi), %eax
+; FALLBACK5-NEXT:    leal (,%rax,8), %ecx
+; FALLBACK5-NEXT:    xorps %xmm2, %xmm2
+; FALLBACK5-NEXT:    movaps %xmm2, -{{[0-9]+}}(%rsp)
+; FALLBACK5-NEXT:    movaps %xmm2, -{{[0-9]+}}(%rsp)
+; FALLBACK5-NEXT:    movaps %xmm1, -{{[0-9]+}}(%rsp)
+; FALLBACK5-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
+; FALLBACK5-NEXT:    andb $24, %al
+; FALLBACK5-NEXT:    negb %al
+; FALLBACK5-NEXT:    movsbq %al, %rax
+; FALLBACK5-NEXT:    movq -24(%rsp,%rax), %rsi
+; FALLBACK5-NEXT:    movq -16(%rsp,%rax), %rdi
+; FALLBACK5-NEXT:    shldq %cl, %rsi, %rdi
+; FALLBACK5-NEXT:    movq -40(%rsp,%rax), %r8
+; FALLBACK5-NEXT:    movq -32(%rsp,%rax), %rax
+; FALLBACK5-NEXT:    shldq %cl, %rax, %rsi
+; FALLBACK5-NEXT:    movq %r8, %r9
+; FALLBACK5-NEXT:    shlq %cl, %r9
+; FALLBACK5-NEXT:    # kill: def $cl killed $cl killed $ecx
+; FALLBACK5-NEXT:    shldq %cl, %r8, %rax
+; FALLBACK5-NEXT:    movq %rax, 8(%rdx)
+; FALLBACK5-NEXT:    movq %rsi, 16(%rdx)
+; FALLBACK5-NEXT:    movq %rdi, 24(%rdx)
+; FALLBACK5-NEXT:    movq %r9, (%rdx)
+; FALLBACK5-NEXT:    retq
+;
+; FALLBACK6-LABEL: shl_32bytes:
+; FALLBACK6:       # %bb.0:
+; FALLBACK6-NEXT:    movups (%rdi), %xmm0
+; FALLBACK6-NEXT:    movups 16(%rdi), %xmm1
+; FALLBACK6-NEXT:    movzbl (%rsi), %ecx
+; FALLBACK6-NEXT:    leal (,%rcx,8), %eax
+; FALLBACK6-NEXT:    xorps %xmm2, %xmm2
+; FALLBACK6-NEXT:    movaps %xmm2, -{{[0-9]+}}(%rsp)
+; FALLBACK6-NEXT:    movaps %xmm2, -{{[0-9]+}}(%rsp)
+; FALLBACK6-NEXT:    movaps %xmm1, -{{[0-9]+}}(%rsp)
+; FALLBACK6-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
+; FALLBACK6-NEXT:    andb $24, %cl
+; FALLBACK6-NEXT:    negb %cl
+; FALLBACK6-NEXT:    movsbq %cl, %rcx
+; FALLBACK6-NEXT:    shlxq %rax, -16(%rsp,%rcx), %rsi
+; FALLBACK6-NEXT:    movq -24(%rsp,%rcx), %rdi
+; FALLBACK6-NEXT:    shlxq %rax, %rdi, %r8
+; FALLBACK6-NEXT:    movq -40(%rsp,%rcx), %r9
+; FALLBACK6-NEXT:    movq -32(%rsp,%rcx), %rcx
+; FALLBACK6-NEXT:    shlxq %rax, %rcx, %r10
+; FALLBACK6-NEXT:    shlxq %rax, %r9, %r11
+; FALLBACK6-NEXT:    # kill: def $al killed $al killed $rax def $rax
+; FALLBACK6-NEXT:    notb %al
+; FALLBACK6-NEXT:    shrq %rdi
+; FALLBACK6-NEXT:    shrxq %rax, %rdi, %rdi
+; FALLBACK6-NEXT:    orq %rsi, %rdi
+; FALLBACK6-NEXT:    shrq %rcx
+; FALLBACK6-NEXT:    shrxq %rax, %rcx, %rcx
+; FALLBACK6-NEXT:    orq %r8, %rcx
+; FALLBACK6-NEXT:    shrq %r9
+; FALLBACK6-NEXT:    shrxq %rax, %r9, %rax
+; FALLBACK6-NEXT:    orq %r10, %rax
+; FALLBACK6-NEXT:    movq %r11, (%rdx)
+; FALLBACK6-NEXT:    movq %rax, 8(%rdx)
+; FALLBACK6-NEXT:    movq %rcx, 16(%rdx)
+; FALLBACK6-NEXT:    movq %rdi, 24(%rdx)
+; FALLBACK6-NEXT:    retq
+;
+; FALLBACK7-LABEL: shl_32bytes:
+; FALLBACK7:       # %bb.0:
+; FALLBACK7-NEXT:    movups (%rdi), %xmm0
+; FALLBACK7-NEXT:    movups 16(%rdi), %xmm1
+; FALLBACK7-NEXT:    movzbl (%rsi), %eax
+; FALLBACK7-NEXT:    leal (,%rax,8), %ecx
+; FALLBACK7-NEXT:    xorps %xmm2, %xmm2
+; FALLBACK7-NEXT:    movaps %xmm2, -{{[0-9]+}}(%rsp)
+; FALLBACK7-NEXT:    movaps %xmm2, -{{[0-9]+}}(%rsp)
+; FALLBACK7-NEXT:    movaps %xmm1, -{{[0-9]+}}(%rsp)
+; FALLBACK7-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
+; FALLBACK7-NEXT:    andb $24, %al
+; FALLBACK7-NEXT:    negb %al
+; FALLBACK7-NEXT:    movsbq %al, %rax
+; FALLBACK7-NEXT:    movq -24(%rsp,%rax), %rsi
+; FALLBACK7-NEXT:    movq -16(%rsp,%rax), %rdi
+; FALLBACK7-NEXT:    shldq %cl, %rsi, %rdi
+; FALLBACK7-NEXT:    movq -40(%rsp,%rax), %r8
+; FALLBACK7-NEXT:    movq -32(%rsp,%rax), %rax
+; FALLBACK7-NEXT:    shldq %cl, %rax, %rsi
+; FALLBACK7-NEXT:    shlxq %rcx, %r8, %r9
+; FALLBACK7-NEXT:    # kill: def $cl killed $cl killed $rcx
+; FALLBACK7-NEXT:    shldq %cl, %r8, %rax
+; FALLBACK7-NEXT:    movq %rax, 8(%rdx)
+; FALLBACK7-NEXT:    movq %rsi, 16(%rdx)
+; FALLBACK7-NEXT:    movq %rdi, 24(%rdx)
+; FALLBACK7-NEXT:    movq %r9, (%rdx)
+; FALLBACK7-NEXT:    retq
+;
+; FALLBACK8-LABEL: shl_32bytes:
+; FALLBACK8:       # %bb.0:
+; FALLBACK8-NEXT:    vmovups (%rdi), %ymm0
+; FALLBACK8-NEXT:    movzbl (%rsi), %ecx
+; FALLBACK8-NEXT:    leal (,%rcx,8), %eax
+; FALLBACK8-NEXT:    vxorps %xmm1, %xmm1, %xmm1
+; FALLBACK8-NEXT:    vmovups %ymm1, -{{[0-9]+}}(%rsp)
+; FALLBACK8-NEXT:    vmovups %ymm0, -{{[0-9]+}}(%rsp)
+; FALLBACK8-NEXT:    andb $24, %cl
+; FALLBACK8-NEXT:    negb %cl
+; FALLBACK8-NEXT:    movsbq %cl, %r8
+; FALLBACK8-NEXT:    movq -16(%rsp,%r8), %r9
+; FALLBACK8-NEXT:    movl %eax, %ecx
+; FALLBACK8-NEXT:    shlq %cl, %r9
+; FALLBACK8-NEXT:    movl %eax, %esi
+; FALLBACK8-NEXT:    notb %sil
+; FALLBACK8-NEXT:    movq -24(%rsp,%r8), %r10
+; FALLBACK8-NEXT:    movq %r10, %rdi
+; FALLBACK8-NEXT:    shrq %rdi
+; FALLBACK8-NEXT:    movl %esi, %ecx
+; FALLBACK8-NEXT:    shrq %cl, %rdi
+; FALLBACK8-NEXT:    orq %r9, %rdi
+; FALLBACK8-NEXT:    movl %eax, %ecx
+; FALLBACK8-NEXT:    shlq %cl, %r10
+; FALLBACK8-NEXT:    movq -40(%rsp,%r8), %r9
+; FALLBACK8-NEXT:    movq -32(%rsp,%r8), %r8
+; FALLBACK8-NEXT:    movq %r8, %r11
+; FALLBACK8-NEXT:    shrq %r11
+; FALLBACK8-NEXT:    movl %esi, %ecx
+; FALLBACK8-NEXT:    shrq %cl, %r11
+; FALLBACK8-NEXT:    orq %r10, %r11
+; FALLBACK8-NEXT:    movl %eax, %ecx
+; FALLBACK8-NEXT:    shlq %cl, %r8
+; FALLBACK8-NEXT:    movq %r9, %r10
+; FALLBACK8-NEXT:    shrq %r10
+; FALLBACK8-NEXT:    movl %esi, %ecx
+; FALLBACK8-NEXT:    shrq %cl, %r10
+; FALLBACK8-NEXT:    orq %r8, %r10
+; FALLBACK8-NEXT:    movl %eax, %ecx
+; FALLBACK8-NEXT:    shlq %cl, %r9
+; FALLBACK8-NEXT:    movq %r9, (%rdx)
+; FALLBACK8-NEXT:    movq %r10, 8(%rdx)
+; FALLBACK8-NEXT:    movq %r11, 16(%rdx)
+; FALLBACK8-NEXT:    movq %rdi, 24(%rdx)
+; FALLBACK8-NEXT:    vzeroupper
+; FALLBACK8-NEXT:    retq
+;
+; FALLBACK9-LABEL: shl_32bytes:
+; FALLBACK9:       # %bb.0:
+; FALLBACK9-NEXT:    vmovups (%rdi), %ymm0
+; FALLBACK9-NEXT:    movzbl (%rsi), %eax
+; FALLBACK9-NEXT:    leal (,%rax,8), %ecx
+; FALLBACK9-NEXT:    vxorps %xmm1, %xmm1, %xmm1
+; FALLBACK9-NEXT:    vmovups %ymm1, -{{[0-9]+}}(%rsp)
+; FALLBACK9-NEXT:    vmovups %ymm0, -{{[0-9]+}}(%rsp)
+; FALLBACK9-NEXT:    andb $24, %al
+; FALLBACK9-NEXT:    negb %al
+; FALLBACK9-NEXT:    movsbq %al, %rax
+; FALLBACK9-NEXT:    movq -24(%rsp,%rax), %rsi
+; FALLBACK9-NEXT:    movq -16(%rsp,%rax), %rdi
+; FALLBACK9-NEXT:    shldq %cl, %rsi, %rdi
+; FALLBACK9-NEXT:    movq -40(%rsp,%rax), %r8
+; FALLBACK9-NEXT:    movq -32(%rsp,%rax), %rax
+; FALLBACK9-NEXT:    shldq %cl, %rax, %rsi
+; FALLBACK9-NEXT:    movq %r8, %r9
+; FALLBACK9-NEXT:    shlq %cl, %r9
+; FALLBACK9-NEXT:    # kill: def $cl killed $cl killed $ecx
+; FALLBACK9-NEXT:    shldq %cl, %r8, %rax
+; FALLBACK9-NEXT:    movq %rax, 8(%rdx)
+; FALLBACK9-NEXT:    movq %rsi, 16(%rdx)
+; FALLBACK9-NEXT:    movq %rdi, 24(%rdx)
+; FALLBACK9-NEXT:    movq %r9, (%rdx)
+; FALLBACK9-NEXT:    vzeroupper
+; FALLBACK9-NEXT:    retq
+;
+; FALLBACK10-LABEL: shl_32bytes:
+; FALLBACK10:       # %bb.0:
+; FALLBACK10-NEXT:    vmovups (%rdi), %ymm0
+; FALLBACK10-NEXT:    movzbl (%rsi), %ecx
+; FALLBACK10-NEXT:    leal (,%rcx,8), %eax
+; FALLBACK10-NEXT:    vxorps %xmm1, %xmm1, %xmm1
+; FALLBACK10-NEXT:    vmovups %ymm1, -{{[0-9]+}}(%rsp)
+; FALLBACK10-NEXT:    vmovups %ymm0, -{{[0-9]+}}(%rsp)
+; FALLBACK10-NEXT:    andb $24, %cl
+; FALLBACK10-NEXT:    negb %cl
+; FALLBACK10-NEXT:    movsbq %cl, %rcx
+; FALLBACK10-NEXT:    shlxq %rax, -16(%rsp,%rcx), %rsi
+; FALLBACK10-NEXT:    movq -24(%rsp,%rcx), %rdi
+; FALLBACK10-NEXT:    shlxq %rax, %rdi, %r8
+; FALLBACK10-NEXT:    movq -40(%rsp,%rcx), %r9
+; FALLBACK10-NEXT:    movq -32(%rsp,%rcx), %rcx
+; FALLBACK10-NEXT:    shlxq %rax, %rcx, %r10
+; FALLBACK10-NEXT:    shlxq %rax, %r9, %r11
+; FALLBACK10-NEXT:    # kill: def $al killed $al killed $rax def $rax
+; FALLBACK10-NEXT:    notb %al
+; FALLBACK10-NEXT:    shrq %rdi
+; FALLBACK10-NEXT:    shrxq %rax, %rdi, %rdi
+; FALLBACK10-NEXT:    orq %rsi, %rdi
+; FALLBACK10-NEXT:    shrq %rcx
+; FALLBACK10-NEXT:    shrxq %rax, %rcx, %rcx
+; FALLBACK10-NEXT:    orq %r8, %rcx
+; FALLBACK10-NEXT:    shrq %r9
+; FALLBACK10-NEXT:    shrxq %rax, %r9, %rax
+; FALLBACK10-NEXT:    orq %r10, %rax
+; FALLBACK10-NEXT:    movq %r11, (%rdx)
+; FALLBACK10-NEXT:    movq %rax, 8(%rdx)
+; FALLBACK10-NEXT:    movq %rcx, 16(%rdx)
+; FALLBACK10-NEXT:    movq %rdi, 24(%rdx)
+; FALLBACK10-NEXT:    vzeroupper
+; FALLBACK10-NEXT:    retq
+;
+; FALLBACK11-LABEL: shl_32bytes:
+; FALLBACK11:       # %bb.0:
+; FALLBACK11-NEXT:    vmovups (%rdi), %ymm0
+; FALLBACK11-NEXT:    movzbl (%rsi), %eax
+; FALLBACK11-NEXT:    leal (,%rax,8), %ecx
+; FALLBACK11-NEXT:    vxorps %xmm1, %xmm1, %xmm1
+; FALLBACK11-NEXT:    vmovups %ymm1, -{{[0-9]+}}(%rsp)
+; FALLBACK11-NEXT:    vmovups %ymm0, -{{[0-9]+}}(%rsp)
+; FALLBACK11-NEXT:    andb $24, %al
+; FALLBACK11-NEXT:    negb %al
+; FALLBACK11-NEXT:    movsbq %al, %rax
+; FALLBACK11-NEXT:    movq -24(%rsp,%rax), %rsi
+; FALLBACK11-NEXT:    movq -16(%rsp,%rax), %rdi
+; FALLBACK11-NEXT:    shldq %cl, %rsi, %rdi
+; FALLBACK11-NEXT:    movq -40(%rsp,%rax), %r8
+; FALLBACK11-NEXT:    movq -32(%rsp,%rax), %rax
+; FALLBACK11-NEXT:    shldq %cl, %rax, %rsi
+; FALLBACK11-NEXT:    shlxq %rcx, %r8, %r9
+; FALLBACK11-NEXT:    # kill: def $cl killed $cl killed $rcx
+; FALLBACK11-NEXT:    shldq %cl, %r8, %rax
+; FALLBACK11-NEXT:    movq %rax, 8(%rdx)
+; FALLBACK11-NEXT:    movq %rsi, 16(%rdx)
+; FALLBACK11-NEXT:    movq %rdi, 24(%rdx)
+; FALLBACK11-NEXT:    movq %r9, (%rdx)
+; FALLBACK11-NEXT:    vzeroupper
+; FALLBACK11-NEXT:    retq
+;
+; FALLBACK12-LABEL: shl_32bytes:
+; FALLBACK12:       # %bb.0:
+; FALLBACK12-NEXT:    vmovups (%rdi), %ymm0
+; FALLBACK12-NEXT:    movzbl (%rsi), %ecx
+; FALLBACK12-NEXT:    leal (,%rcx,8), %eax
+; FALLBACK12-NEXT:    vxorps %xmm1, %xmm1, %xmm1
+; FALLBACK12-NEXT:    vmovups %ymm1, -{{[0-9]+}}(%rsp)
+; FALLBACK12-NEXT:    vmovups %ymm0, -{{[0-9]+}}(%rsp)
+; FALLBACK12-NEXT:    andb $24, %cl
+; FALLBACK12-NEXT:    negb %cl
+; FALLBACK12-NEXT:    movsbq %cl, %r8
+; FALLBACK12-NEXT:    movq -16(%rsp,%r8), %r9
+; FALLBACK12-NEXT:    movl %eax, %ecx
+; FALLBACK12-NEXT:    shlq %cl, %r9
+; FALLBACK12-NEXT:    movl %eax, %esi
+; FALLBACK12-NEXT:    notb %sil
+; FALLBACK12-NEXT:    movq -24(%rsp,%r8), %r10
+; FALLBACK12-NEXT:    movq %r10, %rdi
+; FALLBACK12-NEXT:    shrq %rdi
+; FALLBACK12-NEXT:    movl %esi, %ecx
+; FALLBACK12-NEXT:    shrq %cl, %rdi
+; FALLBACK12-NEXT:    orq %r9, %rdi
+; FALLBACK12-NEXT:    movl %eax, %ecx
+; FALLBACK12-NEXT:    shlq %cl, %r10
+; FALLBACK12-NEXT:    movq -40(%rsp,%r8), %r9
+; FALLBACK12-NEXT:    movq -32(%rsp,%r8), %r8
+; FALLBACK12-NEXT:    movq %r8, %r11
+; FALLBACK12-NEXT:    shrq %r11
+; FALLBACK12-NEXT:    movl %esi, %ecx
+; FALLBACK12-NEXT:    shrq %cl, %r11
+; FALLBACK12-NEXT:    orq %r10, %r11
+; FALLBACK12-NEXT:    movl %eax, %ecx
+; FALLBACK12-NEXT:    shlq %cl, %r8
+; FALLBACK12-NEXT:    movq %r9, %r10
+; FALLBACK12-NEXT:    shrq %r10
+; FALLBACK12-NEXT:    movl %esi, %ecx
+; FALLBACK12-NEXT:    shrq %cl, %r10
+; FALLBACK12-NEXT:    orq %r8, %r10
+; FALLBACK12-NEXT:    movl %eax, %ecx
+; FALLBACK12-NEXT:    shlq %cl, %r9
+; FALLBACK12-NEXT:    movq %r9, (%rdx)
+; FALLBACK12-NEXT:    movq %r10, 8(%rdx)
+; FALLBACK12-NEXT:    movq %r11, 16(%rdx)
+; FALLBACK12-NEXT:    movq %rdi, 24(%rdx)
+; FALLBACK12-NEXT:    vzeroupper
+; FALLBACK12-NEXT:    retq
+;
+; FALLBACK13-LABEL: shl_32bytes:
+; FALLBACK13:       # %bb.0:
+; FALLBACK13-NEXT:    vmovups (%rdi), %ymm0
+; FALLBACK13-NEXT:    movzbl (%rsi), %eax
+; FALLBACK13-NEXT:    leal (,%rax,8), %ecx
+; FALLBACK13-NEXT:    vxorps %xmm1, %xmm1, %xmm1
+; FALLBACK13-NEXT:    vmovups %ymm1, -{{[0-9]+}}(%rsp)
+; FALLBACK13-NEXT:    vmovups %ymm0, -{{[0-9]+}}(%rsp)
+; FALLBACK13-NEXT:    andb $24, %al
+; FALLBACK13-NEXT:    negb %al
+; FALLBACK13-NEXT:    movsbq %al, %rax
+; FALLBACK13-NEXT:    movq -24(%rsp,%rax), %rsi
+; FALLBACK13-NEXT:    movq -16(%rsp,%rax), %rdi
+; FALLBACK13-NEXT:    shldq %cl, %rsi, %rdi
+; FALLBACK13-NEXT:    movq -40(%rsp,%rax), %r8
+; FALLBACK13-NEXT:    movq -32(%rsp,%rax), %rax
+; FALLBACK13-NEXT:    shldq %cl, %rax, %rsi
+; FALLBACK13-NEXT:    movq %r8, %r9
+; FALLBACK13-NEXT:    shlq %cl, %r9
+; FALLBACK13-NEXT:    # kill: def $cl killed $cl killed $ecx
+; FALLBACK13-NEXT:    shldq %cl, %r8, %rax
+; FALLBACK13-NEXT:    movq %rax, 8(%rdx)
+; FALLBACK13-NEXT:    movq %rsi, 16(%rdx)
+; FALLBACK13-NEXT:    movq %rdi, 24(%rdx)
+; FALLBACK13-NEXT:    movq %r9, (%rdx)
+; FALLBACK13-NEXT:    vzeroupper
+; FALLBACK13-NEXT:    retq
+;
+; FALLBACK14-LABEL: shl_32bytes:
+; FALLBACK14:       # %bb.0:
+; FALLBACK14-NEXT:    vmovups (%rdi), %ymm0
+; FALLBACK14-NEXT:    movzbl (%rsi), %ecx
+; FALLBACK14-NEXT:    leal (,%rcx,8), %eax
+; FALLBACK14-NEXT:    vxorps %xmm1, %xmm1, %xmm1
+; FALLBACK14-NEXT:    vmovups %ymm1, -{{[0-9]+}}(%rsp)
+; FALLBACK14-NEXT:    vmovups %ymm0, -{{[0-9]+}}(%rsp)
+; FALLBACK14-NEXT:    andb $24, %cl
+; FALLBACK14-NEXT:    negb %cl
+; FALLBACK14-NEXT:    movsbq %cl, %rcx
+; FALLBACK14-NEXT:    shlxq %rax, -16(%rsp,%rcx), %rsi
+; FALLBACK14-NEXT:    movq -24(%rsp,%rcx), %rdi
+; FALLBACK14-NEXT:    shlxq %rax, %rdi, %r8
+; FALLBACK14-NEXT:    movq -40(%rsp,%rcx), %r9
+; FALLBACK14-NEXT:    movq -32(%rsp,%rcx), %rcx
+; FALLBACK14-NEXT:    shlxq %rax, %rcx, %r10
+; FALLBACK14-NEXT:    shlxq %rax, %r9, %r11
+; FALLBACK14-NEXT:    # kill: def $al killed $al killed $rax def $rax
+; FALLBACK14-NEXT:    notb %al
+; FALLBACK14-NEXT:    shrq %rdi
+; FALLBACK14-NEXT:    shrxq %rax, %rdi, %rdi
+; FALLBACK14-NEXT:    orq %rsi, %rdi
+; FALLBACK14-NEXT:    shrq %rcx
+; FALLBACK14-NEXT:    shrxq %rax, %rcx, %rcx
+; FALLBACK14-NEXT:    orq %r8, %rcx
+; FALLBACK14-NEXT:    shrq %r9
+; FALLBACK14-NEXT:    shrxq %rax, %r9, %rax
+; FALLBACK14-NEXT:    orq %r10, %rax
+; FALLBACK14-NEXT:    movq %r11, (%rdx)
+; FALLBACK14-NEXT:    movq %rax, 8(%rdx)
+; FALLBACK14-NEXT:    movq %rcx, 16(%rdx)
+; FALLBACK14-NEXT:    movq %rdi, 24(%rdx)
+; FALLBACK14-NEXT:    vzeroupper
+; FALLBACK14-NEXT:    retq
+;
+; FALLBACK15-LABEL: shl_32bytes:
+; FALLBACK15:       # %bb.0:
+; FALLBACK15-NEXT:    vmovups (%rdi), %ymm0
+; FALLBACK15-NEXT:    movzbl (%rsi), %eax
+; FALLBACK15-NEXT:    leal (,%rax,8), %ecx
+; FALLBACK15-NEXT:    vxorps %xmm1, %xmm1, %xmm1
+; FALLBACK15-NEXT:    vmovups %ymm1, -{{[0-9]+}}(%rsp)
+; FALLBACK15-NEXT:    vmovups %ymm0, -{{[0-9]+}}(%rsp)
+; FALLBACK15-NEXT:    andb $24, %al
+; FALLBACK15-NEXT:    negb %al
+; FALLBACK15-NEXT:    movsbq %al, %rax
+; FALLBACK15-NEXT:    movq -24(%rsp,%rax), %rsi
+; FALLBACK15-NEXT:    movq -16(%rsp,%rax), %rdi
+; FALLBACK15-NEXT:    shldq %cl, %rsi, %rdi
+; FALLBACK15-NEXT:    movq -40(%rsp,%rax), %r8
+; FALLBACK15-NEXT:    movq -32(%rsp,%rax), %rax
+; FALLBACK15-NEXT:    shldq %cl, %rax, %rsi
+; FALLBACK15-NEXT:    shlxq %rcx, %r8, %r9
+; FALLBACK15-NEXT:    # kill: def $cl killed $cl killed $rcx
+; FALLBACK15-NEXT:    shldq %cl, %r8, %rax
+; FALLBACK15-NEXT:    movq %rax, 8(%rdx)
+; FALLBACK15-NEXT:    movq %rsi, 16(%rdx)
+; FALLBACK15-NEXT:    movq %rdi, 24(%rdx)
+; FALLBACK15-NEXT:    movq %r9, (%rdx)
+; FALLBACK15-NEXT:    vzeroupper
+; FALLBACK15-NEXT:    retq
+;
+; FALLBACK16-LABEL: shl_32bytes:
+; FALLBACK16:       # %bb.0:
+; FALLBACK16-NEXT:    pushl %ebp
+; FALLBACK16-NEXT:    pushl %ebx
+; FALLBACK16-NEXT:    pushl %edi
+; FALLBACK16-NEXT:    pushl %esi
+; FALLBACK16-NEXT:    subl $108, %esp
+; FALLBACK16-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; FALLBACK16-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; FALLBACK16-NEXT:    movl (%ecx), %edx
+; FALLBACK16-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK16-NEXT:    movl 4(%ecx), %edx
+; FALLBACK16-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK16-NEXT:    movl 8(%ecx), %esi
+; FALLBACK16-NEXT:    movl 12(%ecx), %edi
+; FALLBACK16-NEXT:    movl 16(%ecx), %ebx
+; FALLBACK16-NEXT:    movb (%eax), %ah
+; FALLBACK16-NEXT:    movl 20(%ecx), %ebp
+; FALLBACK16-NEXT:    movl 24(%ecx), %edx
+; FALLBACK16-NEXT:    movl 28(%ecx), %ecx
+; FALLBACK16-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; FALLBACK16-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; FALLBACK16-NEXT:    movb %ah, %ch
+; FALLBACK16-NEXT:    shlb $3, %ch
+; FALLBACK16-NEXT:    xorps %xmm0, %xmm0
+; FALLBACK16-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
+; FALLBACK16-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
+; FALLBACK16-NEXT:    movl %ebp, {{[0-9]+}}(%esp)
+; FALLBACK16-NEXT:    movl %ebx, {{[0-9]+}}(%esp)
+; FALLBACK16-NEXT:    movl %edi, {{[0-9]+}}(%esp)
+; FALLBACK16-NEXT:    movl %esi, {{[0-9]+}}(%esp)
+; FALLBACK16-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; FALLBACK16-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; FALLBACK16-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; FALLBACK16-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; FALLBACK16-NEXT:    andb $28, %ah
+; FALLBACK16-NEXT:    negb %ah
+; FALLBACK16-NEXT:    movsbl %ah, %ebx
+; FALLBACK16-NEXT:    movl 64(%esp,%ebx), %edi
+; FALLBACK16-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK16-NEXT:    movl 68(%esp,%ebx), %eax
+; FALLBACK16-NEXT:    movl %eax, %esi
+; FALLBACK16-NEXT:    movb %ch, %cl
+; FALLBACK16-NEXT:    shll %cl, %esi
+; FALLBACK16-NEXT:    movb %ch, %dl
+; FALLBACK16-NEXT:    notb %dl
+; FALLBACK16-NEXT:    shrl %edi
+; FALLBACK16-NEXT:    movb %dl, %cl
+; FALLBACK16-NEXT:    shrl %cl, %edi
+; FALLBACK16-NEXT:    orl %esi, %edi
+; FALLBACK16-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK16-NEXT:    movl 76(%esp,%ebx), %edi
+; FALLBACK16-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK16-NEXT:    movb %ch, %cl
+; FALLBACK16-NEXT:    shll %cl, %edi
+; FALLBACK16-NEXT:    movl 72(%esp,%ebx), %esi
+; FALLBACK16-NEXT:    movl %esi, %ebp
+; FALLBACK16-NEXT:    shrl %ebp
+; FALLBACK16-NEXT:    movb %dl, %cl
+; FALLBACK16-NEXT:    shrl %cl, %ebp
+; FALLBACK16-NEXT:    orl %edi, %ebp
+; FALLBACK16-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK16-NEXT:    movb %ch, %cl
+; FALLBACK16-NEXT:    shll %cl, %esi
+; FALLBACK16-NEXT:    shrl %eax
+; FALLBACK16-NEXT:    movb %dl, %cl
+; FALLBACK16-NEXT:    shrl %cl, %eax
+; FALLBACK16-NEXT:    orl %esi, %eax
+; FALLBACK16-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK16-NEXT:    movl 84(%esp,%ebx), %esi
+; FALLBACK16-NEXT:    movl %esi, %eax
+; FALLBACK16-NEXT:    movb %ch, %cl
+; FALLBACK16-NEXT:    shll %cl, %eax
+; FALLBACK16-NEXT:    movl 80(%esp,%ebx), %edi
+; FALLBACK16-NEXT:    movl %edi, %ebp
+; FALLBACK16-NEXT:    shrl %ebp
+; FALLBACK16-NEXT:    movb %dl, %cl
+; FALLBACK16-NEXT:    shrl %cl, %ebp
+; FALLBACK16-NEXT:    orl %eax, %ebp
+; FALLBACK16-NEXT:    movb %ch, %cl
+; FALLBACK16-NEXT:    shll %cl, %edi
+; FALLBACK16-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK16-NEXT:    shrl %eax
+; FALLBACK16-NEXT:    movb %dl, %cl
+; FALLBACK16-NEXT:    shrl %cl, %eax
+; FALLBACK16-NEXT:    orl %edi, %eax
+; FALLBACK16-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK16-NEXT:    movl 92(%esp,%ebx), %eax
+; FALLBACK16-NEXT:    movb %ch, %cl
+; FALLBACK16-NEXT:    shll %cl, %eax
+; FALLBACK16-NEXT:    movl 88(%esp,%ebx), %edi
+; FALLBACK16-NEXT:    movl %edi, %ebx
+; FALLBACK16-NEXT:    shrl %ebx
+; FALLBACK16-NEXT:    movb %dl, %cl
+; FALLBACK16-NEXT:    shrl %cl, %ebx
+; FALLBACK16-NEXT:    orl %eax, %ebx
+; FALLBACK16-NEXT:    movb %ch, %cl
+; FALLBACK16-NEXT:    shll %cl, %edi
+; FALLBACK16-NEXT:    shrl %esi
+; FALLBACK16-NEXT:    movb %dl, %cl
+; FALLBACK16-NEXT:    shrl %cl, %esi
+; FALLBACK16-NEXT:    orl %edi, %esi
+; FALLBACK16-NEXT:    movb %ch, %cl
+; FALLBACK16-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; FALLBACK16-NEXT:    shll %cl, %edx
+; FALLBACK16-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; FALLBACK16-NEXT:    movl %edx, (%eax)
+; FALLBACK16-NEXT:    movl %esi, 24(%eax)
+; FALLBACK16-NEXT:    movl %ebx, 28(%eax)
+; FALLBACK16-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK16-NEXT:    movl %ecx, 16(%eax)
+; FALLBACK16-NEXT:    movl %ebp, 20(%eax)
+; FALLBACK16-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK16-NEXT:    movl %ecx, 8(%eax)
+; FALLBACK16-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK16-NEXT:    movl %ecx, 12(%eax)
+; FALLBACK16-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK16-NEXT:    movl %ecx, 4(%eax)
+; FALLBACK16-NEXT:    addl $108, %esp
+; FALLBACK16-NEXT:    popl %esi
+; FALLBACK16-NEXT:    popl %edi
+; FALLBACK16-NEXT:    popl %ebx
+; FALLBACK16-NEXT:    popl %ebp
+; FALLBACK16-NEXT:    retl
+;
+; FALLBACK17-LABEL: shl_32bytes:
+; FALLBACK17:       # %bb.0:
+; FALLBACK17-NEXT:    pushl %ebp
+; FALLBACK17-NEXT:    pushl %ebx
+; FALLBACK17-NEXT:    pushl %edi
+; FALLBACK17-NEXT:    pushl %esi
+; FALLBACK17-NEXT:    subl $92, %esp
+; FALLBACK17-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; FALLBACK17-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; FALLBACK17-NEXT:    movl (%eax), %edx
+; FALLBACK17-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK17-NEXT:    movl 4(%eax), %edx
+; FALLBACK17-NEXT:    movl %edx, (%esp) # 4-byte Spill
+; FALLBACK17-NEXT:    movl 8(%eax), %esi
+; FALLBACK17-NEXT:    movl 12(%eax), %edi
+; FALLBACK17-NEXT:    movl 16(%eax), %ebx
+; FALLBACK17-NEXT:    movb (%ecx), %ch
+; FALLBACK17-NEXT:    movl 20(%eax), %ebp
+; FALLBACK17-NEXT:    movl 24(%eax), %edx
+; FALLBACK17-NEXT:    movl 28(%eax), %eax
+; FALLBACK17-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK17-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; FALLBACK17-NEXT:    movb %ch, %cl
+; FALLBACK17-NEXT:    shlb $3, %cl
+; FALLBACK17-NEXT:    xorps %xmm0, %xmm0
+; FALLBACK17-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
+; FALLBACK17-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
+; FALLBACK17-NEXT:    movl %ebp, {{[0-9]+}}(%esp)
+; FALLBACK17-NEXT:    movl %ebx, {{[0-9]+}}(%esp)
+; FALLBACK17-NEXT:    movl %edi, {{[0-9]+}}(%esp)
+; FALLBACK17-NEXT:    movl %esi, {{[0-9]+}}(%esp)
+; FALLBACK17-NEXT:    movl (%esp), %eax # 4-byte Reload
+; FALLBACK17-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK17-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK17-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK17-NEXT:    andb $28, %ch
+; FALLBACK17-NEXT:    negb %ch
+; FALLBACK17-NEXT:    movsbl %ch, %eax
+; FALLBACK17-NEXT:    movl 56(%esp,%eax), %edx
+; FALLBACK17-NEXT:    movl 60(%esp,%eax), %ebx
+; FALLBACK17-NEXT:    movl %ebx, %esi
+; FALLBACK17-NEXT:    shldl %cl, %edx, %esi
+; FALLBACK17-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK17-NEXT:    movl 52(%esp,%eax), %esi
+; FALLBACK17-NEXT:    movl %esi, (%esp) # 4-byte Spill
+; FALLBACK17-NEXT:    shldl %cl, %esi, %edx
+; FALLBACK17-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK17-NEXT:    movl 64(%esp,%eax), %edi
+; FALLBACK17-NEXT:    movl 68(%esp,%eax), %ebp
+; FALLBACK17-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK17-NEXT:    shldl %cl, %edi, %ebp
+; FALLBACK17-NEXT:    shldl %cl, %ebx, %edi
+; FALLBACK17-NEXT:    movl 48(%esp,%eax), %ebx
+; FALLBACK17-NEXT:    movl 72(%esp,%eax), %edx
+; FALLBACK17-NEXT:    movl 76(%esp,%eax), %esi
+; FALLBACK17-NEXT:    shldl %cl, %edx, %esi
+; FALLBACK17-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK17-NEXT:    shldl %cl, %eax, %edx
+; FALLBACK17-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; FALLBACK17-NEXT:    movl %edx, 24(%eax)
+; FALLBACK17-NEXT:    movl %esi, 28(%eax)
+; FALLBACK17-NEXT:    movl %edi, 16(%eax)
+; FALLBACK17-NEXT:    movl %ebp, 20(%eax)
+; FALLBACK17-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; FALLBACK17-NEXT:    movl %edx, 8(%eax)
+; FALLBACK17-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; FALLBACK17-NEXT:    movl %edx, 12(%eax)
+; FALLBACK17-NEXT:    movl (%esp), %edx # 4-byte Reload
+; FALLBACK17-NEXT:    shldl %cl, %ebx, %edx
+; FALLBACK17-NEXT:    shll %cl, %ebx
+; FALLBACK17-NEXT:    movl %ebx, (%eax)
+; FALLBACK17-NEXT:    movl %edx, 4(%eax)
+; FALLBACK17-NEXT:    addl $92, %esp
+; FALLBACK17-NEXT:    popl %esi
+; FALLBACK17-NEXT:    popl %edi
+; FALLBACK17-NEXT:    popl %ebx
+; FALLBACK17-NEXT:    popl %ebp
+; FALLBACK17-NEXT:    retl
+;
+; FALLBACK18-LABEL: shl_32bytes:
+; FALLBACK18:       # %bb.0:
+; FALLBACK18-NEXT:    pushl %ebp
+; FALLBACK18-NEXT:    pushl %ebx
+; FALLBACK18-NEXT:    pushl %edi
+; FALLBACK18-NEXT:    pushl %esi
+; FALLBACK18-NEXT:    subl $108, %esp
+; FALLBACK18-NEXT:    movl {{[0-9]+}}(%esp), %ebx
+; FALLBACK18-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; FALLBACK18-NEXT:    movl (%eax), %ecx
+; FALLBACK18-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK18-NEXT:    movl 4(%eax), %ecx
+; FALLBACK18-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK18-NEXT:    movl 8(%eax), %esi
+; FALLBACK18-NEXT:    movl 12(%eax), %edi
+; FALLBACK18-NEXT:    movl 16(%eax), %ebp
+; FALLBACK18-NEXT:    movzbl (%ebx), %ebx
+; FALLBACK18-NEXT:    movl 20(%eax), %edx
+; FALLBACK18-NEXT:    movl 24(%eax), %ecx
+; FALLBACK18-NEXT:    movl 28(%eax), %eax
+; FALLBACK18-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK18-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; FALLBACK18-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; FALLBACK18-NEXT:    movl %ebx, %edx
+; FALLBACK18-NEXT:    shlb $3, %dl
+; FALLBACK18-NEXT:    xorps %xmm0, %xmm0
+; FALLBACK18-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
+; FALLBACK18-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
+; FALLBACK18-NEXT:    movl %ebp, {{[0-9]+}}(%esp)
+; FALLBACK18-NEXT:    movl %edi, {{[0-9]+}}(%esp)
+; FALLBACK18-NEXT:    movl %esi, {{[0-9]+}}(%esp)
+; FALLBACK18-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK18-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK18-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK18-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK18-NEXT:    andb $28, %bl
+; FALLBACK18-NEXT:    negb %bl
+; FALLBACK18-NEXT:    movsbl %bl, %esi
+; FALLBACK18-NEXT:    movl 64(%esp,%esi), %ebx
+; FALLBACK18-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK18-NEXT:    movl 68(%esp,%esi), %eax
+; FALLBACK18-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK18-NEXT:    shlxl %edx, %eax, %edi
+; FALLBACK18-NEXT:    movl %edx, %ecx
+; FALLBACK18-NEXT:    notb %cl
+; FALLBACK18-NEXT:    shrl %ebx
+; FALLBACK18-NEXT:    shrxl %ecx, %ebx, %ebx
+; FALLBACK18-NEXT:    orl %edi, %ebx
+; FALLBACK18-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK18-NEXT:    movl 72(%esp,%esi), %ebx
+; FALLBACK18-NEXT:    movl %ebx, %edi
+; FALLBACK18-NEXT:    shrl %edi
+; FALLBACK18-NEXT:    shrxl %ecx, %edi, %eax
+; FALLBACK18-NEXT:    movl 76(%esp,%esi), %edi
+; FALLBACK18-NEXT:    shlxl %edx, %edi, %ebp
+; FALLBACK18-NEXT:    orl %ebp, %eax
+; FALLBACK18-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK18-NEXT:    shlxl %edx, %ebx, %ebx
+; FALLBACK18-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK18-NEXT:    shrl %eax
+; FALLBACK18-NEXT:    shrxl %ecx, %eax, %eax
+; FALLBACK18-NEXT:    orl %ebx, %eax
+; FALLBACK18-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK18-NEXT:    movl 80(%esp,%esi), %ebx
+; FALLBACK18-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK18-NEXT:    shrl %ebx
+; FALLBACK18-NEXT:    shrxl %ecx, %ebx, %eax
+; FALLBACK18-NEXT:    movl 84(%esp,%esi), %ebx
+; FALLBACK18-NEXT:    shlxl %edx, %ebx, %ebp
+; FALLBACK18-NEXT:    orl %ebp, %eax
+; FALLBACK18-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK18-NEXT:    shlxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; FALLBACK18-NEXT:    shrl %edi
+; FALLBACK18-NEXT:    shrxl %ecx, %edi, %edi
+; FALLBACK18-NEXT:    orl %eax, %edi
+; FALLBACK18-NEXT:    shlxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; FALLBACK18-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK18-NEXT:    shlxl %edx, 92(%esp,%esi), %ebp
+; FALLBACK18-NEXT:    movl 88(%esp,%esi), %esi
+; FALLBACK18-NEXT:    shlxl %edx, %esi, %eax
+; FALLBACK18-NEXT:    shrl %esi
+; FALLBACK18-NEXT:    shrxl %ecx, %esi, %esi
+; FALLBACK18-NEXT:    orl %ebp, %esi
+; FALLBACK18-NEXT:    shrl %ebx
+; FALLBACK18-NEXT:    shrxl %ecx, %ebx, %edx
+; FALLBACK18-NEXT:    orl %eax, %edx
+; FALLBACK18-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; FALLBACK18-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK18-NEXT:    movl %ecx, (%eax)
+; FALLBACK18-NEXT:    movl %edx, 24(%eax)
+; FALLBACK18-NEXT:    movl %esi, 28(%eax)
+; FALLBACK18-NEXT:    movl %edi, 16(%eax)
+; FALLBACK18-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK18-NEXT:    movl %ecx, 20(%eax)
+; FALLBACK18-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK18-NEXT:    movl %ecx, 8(%eax)
+; FALLBACK18-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK18-NEXT:    movl %ecx, 12(%eax)
+; FALLBACK18-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK18-NEXT:    movl %ecx, 4(%eax)
+; FALLBACK18-NEXT:    addl $108, %esp
+; FALLBACK18-NEXT:    popl %esi
+; FALLBACK18-NEXT:    popl %edi
+; FALLBACK18-NEXT:    popl %ebx
+; FALLBACK18-NEXT:    popl %ebp
+; FALLBACK18-NEXT:    retl
+;
+; FALLBACK19-LABEL: shl_32bytes:
+; FALLBACK19:       # %bb.0:
+; FALLBACK19-NEXT:    pushl %ebp
+; FALLBACK19-NEXT:    pushl %ebx
+; FALLBACK19-NEXT:    pushl %edi
+; FALLBACK19-NEXT:    pushl %esi
+; FALLBACK19-NEXT:    subl $92, %esp
+; FALLBACK19-NEXT:    movl {{[0-9]+}}(%esp), %ebx
+; FALLBACK19-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; FALLBACK19-NEXT:    movl (%ecx), %eax
+; FALLBACK19-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK19-NEXT:    movl 4(%ecx), %eax
+; FALLBACK19-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK19-NEXT:    movl 8(%ecx), %esi
+; FALLBACK19-NEXT:    movl 12(%ecx), %edi
+; FALLBACK19-NEXT:    movl 16(%ecx), %ebp
+; FALLBACK19-NEXT:    movzbl (%ebx), %ebx
+; FALLBACK19-NEXT:    movl 20(%ecx), %edx
+; FALLBACK19-NEXT:    movl 24(%ecx), %eax
+; FALLBACK19-NEXT:    movl 28(%ecx), %ecx
+; FALLBACK19-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; FALLBACK19-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK19-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; FALLBACK19-NEXT:    movl %ebx, %ecx
+; FALLBACK19-NEXT:    shlb $3, %cl
+; FALLBACK19-NEXT:    xorps %xmm0, %xmm0
+; FALLBACK19-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
+; FALLBACK19-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
+; FALLBACK19-NEXT:    movl %ebp, {{[0-9]+}}(%esp)
+; FALLBACK19-NEXT:    movl %edi, {{[0-9]+}}(%esp)
+; FALLBACK19-NEXT:    movl %esi, {{[0-9]+}}(%esp)
+; FALLBACK19-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK19-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK19-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK19-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK19-NEXT:    andb $28, %bl
+; FALLBACK19-NEXT:    negb %bl
+; FALLBACK19-NEXT:    movsbl %bl, %eax
+; FALLBACK19-NEXT:    movl 56(%esp,%eax), %edx
+; FALLBACK19-NEXT:    movl 60(%esp,%eax), %esi
+; FALLBACK19-NEXT:    movl %esi, (%esp) # 4-byte Spill
+; FALLBACK19-NEXT:    shldl %cl, %edx, %esi
+; FALLBACK19-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK19-NEXT:    movl 52(%esp,%eax), %ebx
+; FALLBACK19-NEXT:    shldl %cl, %ebx, %edx
+; FALLBACK19-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK19-NEXT:    movl 64(%esp,%eax), %edi
+; FALLBACK19-NEXT:    movl 68(%esp,%eax), %ebp
+; FALLBACK19-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK19-NEXT:    shldl %cl, %edi, %ebp
+; FALLBACK19-NEXT:    movl (%esp), %edx # 4-byte Reload
+; FALLBACK19-NEXT:    shldl %cl, %edx, %edi
+; FALLBACK19-NEXT:    movl 48(%esp,%eax), %edx
+; FALLBACK19-NEXT:    movl %edx, (%esp) # 4-byte Spill
+; FALLBACK19-NEXT:    movl 72(%esp,%eax), %edx
+; FALLBACK19-NEXT:    movl 76(%esp,%eax), %esi
+; FALLBACK19-NEXT:    shldl %cl, %edx, %esi
+; FALLBACK19-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK19-NEXT:    shldl %cl, %eax, %edx
+; FALLBACK19-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; FALLBACK19-NEXT:    movl %edx, 24(%eax)
+; FALLBACK19-NEXT:    movl %esi, 28(%eax)
+; FALLBACK19-NEXT:    movl %edi, 16(%eax)
+; FALLBACK19-NEXT:    movl %ebp, 20(%eax)
+; FALLBACK19-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; FALLBACK19-NEXT:    movl %edx, 8(%eax)
+; FALLBACK19-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; FALLBACK19-NEXT:    movl %edx, 12(%eax)
+; FALLBACK19-NEXT:    movl (%esp), %esi # 4-byte Reload
+; FALLBACK19-NEXT:    shlxl %ecx, %esi, %edx
+; FALLBACK19-NEXT:    movl %edx, (%eax)
+; FALLBACK19-NEXT:    # kill: def $cl killed $cl killed $ecx
+; FALLBACK19-NEXT:    shldl %cl, %esi, %ebx
+; FALLBACK19-NEXT:    movl %ebx, 4(%eax)
+; FALLBACK19-NEXT:    addl $92, %esp
+; FALLBACK19-NEXT:    popl %esi
+; FALLBACK19-NEXT:    popl %edi
+; FALLBACK19-NEXT:    popl %ebx
+; FALLBACK19-NEXT:    popl %ebp
+; FALLBACK19-NEXT:    retl
+;
+; FALLBACK20-LABEL: shl_32bytes:
+; FALLBACK20:       # %bb.0:
+; FALLBACK20-NEXT:    pushl %ebp
+; FALLBACK20-NEXT:    pushl %ebx
+; FALLBACK20-NEXT:    pushl %edi
+; FALLBACK20-NEXT:    pushl %esi
+; FALLBACK20-NEXT:    subl $108, %esp
+; FALLBACK20-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; FALLBACK20-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; FALLBACK20-NEXT:    movups (%ecx), %xmm0
+; FALLBACK20-NEXT:    movups 16(%ecx), %xmm1
+; FALLBACK20-NEXT:    movzbl (%eax), %ecx
+; FALLBACK20-NEXT:    movb %cl, %dh
+; FALLBACK20-NEXT:    shlb $3, %dh
+; FALLBACK20-NEXT:    xorps %xmm2, %xmm2
+; FALLBACK20-NEXT:    movaps %xmm2, {{[0-9]+}}(%esp)
+; FALLBACK20-NEXT:    movaps %xmm2, {{[0-9]+}}(%esp)
+; FALLBACK20-NEXT:    movaps %xmm1, {{[0-9]+}}(%esp)
+; FALLBACK20-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
+; FALLBACK20-NEXT:    andb $28, %cl
+; FALLBACK20-NEXT:    negb %cl
+; FALLBACK20-NEXT:    movsbl %cl, %eax
+; FALLBACK20-NEXT:    movl 84(%esp,%eax), %edi
+; FALLBACK20-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK20-NEXT:    movb %dh, %cl
+; FALLBACK20-NEXT:    shll %cl, %edi
+; FALLBACK20-NEXT:    movb %dh, %dl
+; FALLBACK20-NEXT:    notb %dl
+; FALLBACK20-NEXT:    movl 80(%esp,%eax), %esi
+; FALLBACK20-NEXT:    movl %eax, %ebx
+; FALLBACK20-NEXT:    movl %esi, %eax
+; FALLBACK20-NEXT:    shrl %eax
+; FALLBACK20-NEXT:    movl %edx, %ecx
+; FALLBACK20-NEXT:    shrl %cl, %eax
+; FALLBACK20-NEXT:    orl %edi, %eax
+; FALLBACK20-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK20-NEXT:    movb %dh, %cl
+; FALLBACK20-NEXT:    shll %cl, %esi
+; FALLBACK20-NEXT:    movl %ebx, %edi
+; FALLBACK20-NEXT:    movl 76(%esp,%ebx), %ebp
+; FALLBACK20-NEXT:    movl %ebp, %eax
+; FALLBACK20-NEXT:    shrl %eax
+; FALLBACK20-NEXT:    movl %edx, %ecx
+; FALLBACK20-NEXT:    shrl %cl, %eax
+; FALLBACK20-NEXT:    orl %esi, %eax
+; FALLBACK20-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK20-NEXT:    movb %dh, %cl
+; FALLBACK20-NEXT:    shll %cl, %ebp
+; FALLBACK20-NEXT:    movl 72(%esp,%ebx), %ebx
+; FALLBACK20-NEXT:    movl %ebx, %eax
+; FALLBACK20-NEXT:    shrl %eax
+; FALLBACK20-NEXT:    movl %edx, %ecx
+; FALLBACK20-NEXT:    shrl %cl, %eax
+; FALLBACK20-NEXT:    orl %ebp, %eax
+; FALLBACK20-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK20-NEXT:    movb %dh, %cl
+; FALLBACK20-NEXT:    shll %cl, %ebx
+; FALLBACK20-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK20-NEXT:    movl 68(%esp,%edi), %ebp
+; FALLBACK20-NEXT:    movl %ebp, %esi
+; FALLBACK20-NEXT:    shrl %esi
+; FALLBACK20-NEXT:    movl %edx, %ecx
+; FALLBACK20-NEXT:    shrl %cl, %esi
+; FALLBACK20-NEXT:    orl %ebx, %esi
+; FALLBACK20-NEXT:    movb %dh, %cl
+; FALLBACK20-NEXT:    shll %cl, %ebp
+; FALLBACK20-NEXT:    movl 64(%esp,%edi), %ebx
+; FALLBACK20-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK20-NEXT:    shrl %ebx
+; FALLBACK20-NEXT:    movl %edx, %ecx
+; FALLBACK20-NEXT:    shrl %cl, %ebx
+; FALLBACK20-NEXT:    orl %ebp, %ebx
+; FALLBACK20-NEXT:    movl 88(%esp,%edi), %ebp
+; FALLBACK20-NEXT:    movl %ebp, %edi
+; FALLBACK20-NEXT:    movb %dh, %cl
+; FALLBACK20-NEXT:    shll %cl, %edi
+; FALLBACK20-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK20-NEXT:    shrl %eax
+; FALLBACK20-NEXT:    movl %edx, %ecx
+; FALLBACK20-NEXT:    shrl %cl, %eax
+; FALLBACK20-NEXT:    orl %edi, %eax
+; FALLBACK20-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK20-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK20-NEXT:    movl 92(%esp,%eax), %edi
+; FALLBACK20-NEXT:    movb %dh, %cl
+; FALLBACK20-NEXT:    shll %cl, %edi
+; FALLBACK20-NEXT:    shrl %ebp
+; FALLBACK20-NEXT:    movl %edx, %ecx
+; FALLBACK20-NEXT:    shrl %cl, %ebp
+; FALLBACK20-NEXT:    orl %edi, %ebp
+; FALLBACK20-NEXT:    movb %dh, %cl
+; FALLBACK20-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; FALLBACK20-NEXT:    shll %cl, %edx
+; FALLBACK20-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; FALLBACK20-NEXT:    movl %edx, (%eax)
+; FALLBACK20-NEXT:    movl %ebp, 28(%eax)
+; FALLBACK20-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK20-NEXT:    movl %ecx, 24(%eax)
+; FALLBACK20-NEXT:    movl %ebx, 4(%eax)
+; FALLBACK20-NEXT:    movl %esi, 8(%eax)
+; FALLBACK20-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK20-NEXT:    movl %ecx, 12(%eax)
+; FALLBACK20-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK20-NEXT:    movl %ecx, 16(%eax)
+; FALLBACK20-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK20-NEXT:    movl %ecx, 20(%eax)
+; FALLBACK20-NEXT:    addl $108, %esp
+; FALLBACK20-NEXT:    popl %esi
+; FALLBACK20-NEXT:    popl %edi
+; FALLBACK20-NEXT:    popl %ebx
+; FALLBACK20-NEXT:    popl %ebp
+; FALLBACK20-NEXT:    retl
+;
+; FALLBACK21-LABEL: shl_32bytes:
+; FALLBACK21:       # %bb.0:
+; FALLBACK21-NEXT:    pushl %ebp
+; FALLBACK21-NEXT:    pushl %ebx
+; FALLBACK21-NEXT:    pushl %edi
+; FALLBACK21-NEXT:    pushl %esi
+; FALLBACK21-NEXT:    subl $92, %esp
+; FALLBACK21-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; FALLBACK21-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; FALLBACK21-NEXT:    movups (%ecx), %xmm0
+; FALLBACK21-NEXT:    movups 16(%ecx), %xmm1
+; FALLBACK21-NEXT:    movzbl (%eax), %eax
+; FALLBACK21-NEXT:    movl %eax, %ecx
+; FALLBACK21-NEXT:    shlb $3, %cl
+; FALLBACK21-NEXT:    xorps %xmm2, %xmm2
+; FALLBACK21-NEXT:    movaps %xmm2, {{[0-9]+}}(%esp)
+; FALLBACK21-NEXT:    movaps %xmm2, {{[0-9]+}}(%esp)
+; FALLBACK21-NEXT:    movaps %xmm1, {{[0-9]+}}(%esp)
+; FALLBACK21-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
+; FALLBACK21-NEXT:    andb $28, %al
+; FALLBACK21-NEXT:    negb %al
+; FALLBACK21-NEXT:    movsbl %al, %ebp
+; FALLBACK21-NEXT:    movl 64(%esp,%ebp), %eax
+; FALLBACK21-NEXT:    movl 68(%esp,%ebp), %edx
+; FALLBACK21-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK21-NEXT:    shldl %cl, %eax, %edx
+; FALLBACK21-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK21-NEXT:    movl 60(%esp,%ebp), %edx
+; FALLBACK21-NEXT:    shldl %cl, %edx, %eax
+; FALLBACK21-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK21-NEXT:    movl 56(%esp,%ebp), %edi
+; FALLBACK21-NEXT:    shldl %cl, %edi, %edx
+; FALLBACK21-NEXT:    movl %edx, (%esp) # 4-byte Spill
+; FALLBACK21-NEXT:    movl 52(%esp,%ebp), %ebx
+; FALLBACK21-NEXT:    shldl %cl, %ebx, %edi
+; FALLBACK21-NEXT:    movl 72(%esp,%ebp), %edx
+; FALLBACK21-NEXT:    movl %edx, %eax
+; FALLBACK21-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; FALLBACK21-NEXT:    shldl %cl, %esi, %eax
+; FALLBACK21-NEXT:    movl 48(%esp,%ebp), %esi
+; FALLBACK21-NEXT:    movl 76(%esp,%ebp), %ebp
+; FALLBACK21-NEXT:    shldl %cl, %edx, %ebp
+; FALLBACK21-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; FALLBACK21-NEXT:    movl %ebp, 28(%edx)
+; FALLBACK21-NEXT:    movl %eax, 24(%edx)
+; FALLBACK21-NEXT:    movl %esi, %eax
+; FALLBACK21-NEXT:    shll %cl, %eax
+; FALLBACK21-NEXT:    shldl %cl, %esi, %ebx
+; FALLBACK21-NEXT:    movl %ebx, 4(%edx)
+; FALLBACK21-NEXT:    movl %edi, 8(%edx)
+; FALLBACK21-NEXT:    movl (%esp), %ecx # 4-byte Reload
+; FALLBACK21-NEXT:    movl %ecx, 12(%edx)
+; FALLBACK21-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK21-NEXT:    movl %ecx, 16(%edx)
+; FALLBACK21-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK21-NEXT:    movl %ecx, 20(%edx)
+; FALLBACK21-NEXT:    movl %eax, (%edx)
+; FALLBACK21-NEXT:    addl $92, %esp
+; FALLBACK21-NEXT:    popl %esi
+; FALLBACK21-NEXT:    popl %edi
+; FALLBACK21-NEXT:    popl %ebx
+; FALLBACK21-NEXT:    popl %ebp
+; FALLBACK21-NEXT:    retl
+;
+; FALLBACK22-LABEL: shl_32bytes:
+; FALLBACK22:       # %bb.0:
+; FALLBACK22-NEXT:    pushl %ebp
+; FALLBACK22-NEXT:    pushl %ebx
+; FALLBACK22-NEXT:    pushl %edi
+; FALLBACK22-NEXT:    pushl %esi
+; FALLBACK22-NEXT:    subl $108, %esp
+; FALLBACK22-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; FALLBACK22-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; FALLBACK22-NEXT:    movups (%ecx), %xmm0
+; FALLBACK22-NEXT:    movups 16(%ecx), %xmm1
+; FALLBACK22-NEXT:    movzbl (%eax), %ecx
+; FALLBACK22-NEXT:    movl %ecx, %eax
+; FALLBACK22-NEXT:    shlb $3, %al
+; FALLBACK22-NEXT:    xorps %xmm2, %xmm2
+; FALLBACK22-NEXT:    movaps %xmm2, {{[0-9]+}}(%esp)
+; FALLBACK22-NEXT:    movaps %xmm2, {{[0-9]+}}(%esp)
+; FALLBACK22-NEXT:    movaps %xmm1, {{[0-9]+}}(%esp)
+; FALLBACK22-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
+; FALLBACK22-NEXT:    andb $28, %cl
+; FALLBACK22-NEXT:    negb %cl
+; FALLBACK22-NEXT:    movsbl %cl, %edx
+; FALLBACK22-NEXT:    movl 84(%esp,%edx), %ecx
+; FALLBACK22-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK22-NEXT:    shlxl %eax, %ecx, %ecx
+; FALLBACK22-NEXT:    movl 80(%esp,%edx), %esi
+; FALLBACK22-NEXT:    shlxl %eax, %esi, %edi
+; FALLBACK22-NEXT:    movl %eax, %ebx
+; FALLBACK22-NEXT:    notb %bl
+; FALLBACK22-NEXT:    shrl %esi
+; FALLBACK22-NEXT:    shrxl %ebx, %esi, %esi
+; FALLBACK22-NEXT:    orl %ecx, %esi
+; FALLBACK22-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK22-NEXT:    movl 76(%esp,%edx), %ecx
+; FALLBACK22-NEXT:    movl %ecx, %esi
+; FALLBACK22-NEXT:    shrl %esi
+; FALLBACK22-NEXT:    shrxl %ebx, %esi, %esi
+; FALLBACK22-NEXT:    orl %edi, %esi
+; FALLBACK22-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK22-NEXT:    shlxl %eax, %ecx, %ecx
+; FALLBACK22-NEXT:    movl 72(%esp,%edx), %esi
+; FALLBACK22-NEXT:    movl %esi, %edi
+; FALLBACK22-NEXT:    shrl %edi
+; FALLBACK22-NEXT:    shrxl %ebx, %edi, %edi
+; FALLBACK22-NEXT:    orl %ecx, %edi
+; FALLBACK22-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK22-NEXT:    shlxl %eax, %esi, %ecx
+; FALLBACK22-NEXT:    movl 68(%esp,%edx), %esi
+; FALLBACK22-NEXT:    movl %esi, %edi
+; FALLBACK22-NEXT:    shrl %edi
+; FALLBACK22-NEXT:    shrxl %ebx, %edi, %ebp
+; FALLBACK22-NEXT:    orl %ecx, %ebp
+; FALLBACK22-NEXT:    shlxl %eax, %esi, %edi
+; FALLBACK22-NEXT:    movl 64(%esp,%edx), %esi
+; FALLBACK22-NEXT:    movl %esi, %ecx
+; FALLBACK22-NEXT:    shrl %ecx
+; FALLBACK22-NEXT:    shrxl %ebx, %ecx, %ecx
+; FALLBACK22-NEXT:    orl %edi, %ecx
+; FALLBACK22-NEXT:    shlxl %eax, %esi, %esi
+; FALLBACK22-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK22-NEXT:    shlxl %eax, 92(%esp,%edx), %edi
+; FALLBACK22-NEXT:    movl 88(%esp,%edx), %edx
+; FALLBACK22-NEXT:    shlxl %eax, %edx, %esi
+; FALLBACK22-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK22-NEXT:    shrl %eax
+; FALLBACK22-NEXT:    shrxl %ebx, %eax, %eax
+; FALLBACK22-NEXT:    orl %esi, %eax
+; FALLBACK22-NEXT:    shrl %edx
+; FALLBACK22-NEXT:    shrxl %ebx, %edx, %edx
+; FALLBACK22-NEXT:    orl %edi, %edx
+; FALLBACK22-NEXT:    movl {{[0-9]+}}(%esp), %esi
+; FALLBACK22-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; FALLBACK22-NEXT:    movl %edi, (%esi)
+; FALLBACK22-NEXT:    movl %edx, 28(%esi)
+; FALLBACK22-NEXT:    movl %eax, 24(%esi)
+; FALLBACK22-NEXT:    movl %ecx, 4(%esi)
+; FALLBACK22-NEXT:    movl %ebp, 8(%esi)
+; FALLBACK22-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK22-NEXT:    movl %eax, 12(%esi)
+; FALLBACK22-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK22-NEXT:    movl %eax, 16(%esi)
+; FALLBACK22-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK22-NEXT:    movl %eax, 20(%esi)
+; FALLBACK22-NEXT:    addl $108, %esp
+; FALLBACK22-NEXT:    popl %esi
+; FALLBACK22-NEXT:    popl %edi
+; FALLBACK22-NEXT:    popl %ebx
+; FALLBACK22-NEXT:    popl %ebp
+; FALLBACK22-NEXT:    retl
+;
+; FALLBACK23-LABEL: shl_32bytes:
+; FALLBACK23:       # %bb.0:
+; FALLBACK23-NEXT:    pushl %ebp
+; FALLBACK23-NEXT:    pushl %ebx
+; FALLBACK23-NEXT:    pushl %edi
+; FALLBACK23-NEXT:    pushl %esi
+; FALLBACK23-NEXT:    subl $92, %esp
+; FALLBACK23-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; FALLBACK23-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; FALLBACK23-NEXT:    movups (%ecx), %xmm0
+; FALLBACK23-NEXT:    movups 16(%ecx), %xmm1
+; FALLBACK23-NEXT:    movzbl (%eax), %eax
+; FALLBACK23-NEXT:    movl %eax, %ecx
+; FALLBACK23-NEXT:    shlb $3, %cl
+; FALLBACK23-NEXT:    xorps %xmm2, %xmm2
+; FALLBACK23-NEXT:    movaps %xmm2, {{[0-9]+}}(%esp)
+; FALLBACK23-NEXT:    movaps %xmm2, {{[0-9]+}}(%esp)
+; FALLBACK23-NEXT:    movaps %xmm1, {{[0-9]+}}(%esp)
+; FALLBACK23-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
+; FALLBACK23-NEXT:    andb $28, %al
+; FALLBACK23-NEXT:    negb %al
+; FALLBACK23-NEXT:    movsbl %al, %ebx
+; FALLBACK23-NEXT:    movl 64(%esp,%ebx), %eax
+; FALLBACK23-NEXT:    movl 68(%esp,%ebx), %edx
+; FALLBACK23-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK23-NEXT:    shldl %cl, %eax, %edx
+; FALLBACK23-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK23-NEXT:    movl 60(%esp,%ebx), %edx
+; FALLBACK23-NEXT:    shldl %cl, %edx, %eax
+; FALLBACK23-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK23-NEXT:    movl 56(%esp,%ebx), %edi
+; FALLBACK23-NEXT:    shldl %cl, %edi, %edx
+; FALLBACK23-NEXT:    movl %edx, (%esp) # 4-byte Spill
+; FALLBACK23-NEXT:    movl 52(%esp,%ebx), %ebp
+; FALLBACK23-NEXT:    shldl %cl, %ebp, %edi
+; FALLBACK23-NEXT:    movl 72(%esp,%ebx), %edx
+; FALLBACK23-NEXT:    movl %edx, %eax
+; FALLBACK23-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; FALLBACK23-NEXT:    shldl %cl, %esi, %eax
+; FALLBACK23-NEXT:    movl 48(%esp,%ebx), %esi
+; FALLBACK23-NEXT:    movl 76(%esp,%ebx), %ebx
+; FALLBACK23-NEXT:    shldl %cl, %edx, %ebx
+; FALLBACK23-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; FALLBACK23-NEXT:    movl %ebx, 28(%edx)
+; FALLBACK23-NEXT:    movl %eax, 24(%edx)
+; FALLBACK23-NEXT:    shlxl %ecx, %esi, %eax
+; FALLBACK23-NEXT:    # kill: def $cl killed $cl killed $ecx
+; FALLBACK23-NEXT:    shldl %cl, %esi, %ebp
+; FALLBACK23-NEXT:    movl %ebp, 4(%edx)
+; FALLBACK23-NEXT:    movl %edi, 8(%edx)
+; FALLBACK23-NEXT:    movl (%esp), %ecx # 4-byte Reload
+; FALLBACK23-NEXT:    movl %ecx, 12(%edx)
+; FALLBACK23-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK23-NEXT:    movl %ecx, 16(%edx)
+; FALLBACK23-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK23-NEXT:    movl %ecx, 20(%edx)
+; FALLBACK23-NEXT:    movl %eax, (%edx)
+; FALLBACK23-NEXT:    addl $92, %esp
+; FALLBACK23-NEXT:    popl %esi
+; FALLBACK23-NEXT:    popl %edi
+; FALLBACK23-NEXT:    popl %ebx
+; FALLBACK23-NEXT:    popl %ebp
+; FALLBACK23-NEXT:    retl
+;
+; FALLBACK24-LABEL: shl_32bytes:
+; FALLBACK24:       # %bb.0:
+; FALLBACK24-NEXT:    pushl %ebp
+; FALLBACK24-NEXT:    pushl %ebx
+; FALLBACK24-NEXT:    pushl %edi
+; FALLBACK24-NEXT:    pushl %esi
+; FALLBACK24-NEXT:    subl $108, %esp
+; FALLBACK24-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; FALLBACK24-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; FALLBACK24-NEXT:    vmovups (%ecx), %ymm0
+; FALLBACK24-NEXT:    movzbl (%eax), %ecx
+; FALLBACK24-NEXT:    movb %cl, %dh
+; FALLBACK24-NEXT:    shlb $3, %dh
+; FALLBACK24-NEXT:    vxorps %xmm1, %xmm1, %xmm1
+; FALLBACK24-NEXT:    vmovups %ymm1, {{[0-9]+}}(%esp)
+; FALLBACK24-NEXT:    vmovups %ymm0, {{[0-9]+}}(%esp)
+; FALLBACK24-NEXT:    andb $28, %cl
+; FALLBACK24-NEXT:    negb %cl
+; FALLBACK24-NEXT:    movsbl %cl, %eax
+; FALLBACK24-NEXT:    movl 84(%esp,%eax), %edi
+; FALLBACK24-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK24-NEXT:    movb %dh, %cl
+; FALLBACK24-NEXT:    shll %cl, %edi
+; FALLBACK24-NEXT:    movb %dh, %dl
+; FALLBACK24-NEXT:    notb %dl
+; FALLBACK24-NEXT:    movl 80(%esp,%eax), %esi
+; FALLBACK24-NEXT:    movl %eax, %ebx
+; FALLBACK24-NEXT:    movl %esi, %eax
+; FALLBACK24-NEXT:    shrl %eax
+; FALLBACK24-NEXT:    movl %edx, %ecx
+; FALLBACK24-NEXT:    shrl %cl, %eax
+; FALLBACK24-NEXT:    orl %edi, %eax
+; FALLBACK24-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK24-NEXT:    movb %dh, %cl
+; FALLBACK24-NEXT:    shll %cl, %esi
+; FALLBACK24-NEXT:    movl %ebx, %edi
+; FALLBACK24-NEXT:    movl 76(%esp,%ebx), %ebp
+; FALLBACK24-NEXT:    movl %ebp, %eax
+; FALLBACK24-NEXT:    shrl %eax
+; FALLBACK24-NEXT:    movl %edx, %ecx
+; FALLBACK24-NEXT:    shrl %cl, %eax
+; FALLBACK24-NEXT:    orl %esi, %eax
+; FALLBACK24-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK24-NEXT:    movb %dh, %cl
+; FALLBACK24-NEXT:    shll %cl, %ebp
+; FALLBACK24-NEXT:    movl 72(%esp,%ebx), %ebx
+; FALLBACK24-NEXT:    movl %ebx, %eax
+; FALLBACK24-NEXT:    shrl %eax
+; FALLBACK24-NEXT:    movl %edx, %ecx
+; FALLBACK24-NEXT:    shrl %cl, %eax
+; FALLBACK24-NEXT:    orl %ebp, %eax
+; FALLBACK24-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK24-NEXT:    movb %dh, %cl
+; FALLBACK24-NEXT:    shll %cl, %ebx
+; FALLBACK24-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK24-NEXT:    movl 68(%esp,%edi), %ebp
+; FALLBACK24-NEXT:    movl %ebp, %esi
+; FALLBACK24-NEXT:    shrl %esi
+; FALLBACK24-NEXT:    movl %edx, %ecx
+; FALLBACK24-NEXT:    shrl %cl, %esi
+; FALLBACK24-NEXT:    orl %ebx, %esi
+; FALLBACK24-NEXT:    movb %dh, %cl
+; FALLBACK24-NEXT:    shll %cl, %ebp
+; FALLBACK24-NEXT:    movl 64(%esp,%edi), %ebx
+; FALLBACK24-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK24-NEXT:    shrl %ebx
+; FALLBACK24-NEXT:    movl %edx, %ecx
+; FALLBACK24-NEXT:    shrl %cl, %ebx
+; FALLBACK24-NEXT:    orl %ebp, %ebx
+; FALLBACK24-NEXT:    movl 88(%esp,%edi), %ebp
+; FALLBACK24-NEXT:    movl %ebp, %edi
+; FALLBACK24-NEXT:    movb %dh, %cl
+; FALLBACK24-NEXT:    shll %cl, %edi
+; FALLBACK24-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK24-NEXT:    shrl %eax
+; FALLBACK24-NEXT:    movl %edx, %ecx
+; FALLBACK24-NEXT:    shrl %cl, %eax
+; FALLBACK24-NEXT:    orl %edi, %eax
+; FALLBACK24-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK24-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK24-NEXT:    movl 92(%esp,%eax), %edi
+; FALLBACK24-NEXT:    movb %dh, %cl
+; FALLBACK24-NEXT:    shll %cl, %edi
+; FALLBACK24-NEXT:    shrl %ebp
+; FALLBACK24-NEXT:    movl %edx, %ecx
+; FALLBACK24-NEXT:    shrl %cl, %ebp
+; FALLBACK24-NEXT:    orl %edi, %ebp
+; FALLBACK24-NEXT:    movb %dh, %cl
+; FALLBACK24-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; FALLBACK24-NEXT:    shll %cl, %edx
+; FALLBACK24-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; FALLBACK24-NEXT:    movl %edx, (%eax)
+; FALLBACK24-NEXT:    movl %ebp, 28(%eax)
+; FALLBACK24-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK24-NEXT:    movl %ecx, 24(%eax)
+; FALLBACK24-NEXT:    movl %ebx, 4(%eax)
+; FALLBACK24-NEXT:    movl %esi, 8(%eax)
+; FALLBACK24-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK24-NEXT:    movl %ecx, 12(%eax)
+; FALLBACK24-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK24-NEXT:    movl %ecx, 16(%eax)
+; FALLBACK24-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK24-NEXT:    movl %ecx, 20(%eax)
+; FALLBACK24-NEXT:    addl $108, %esp
+; FALLBACK24-NEXT:    popl %esi
+; FALLBACK24-NEXT:    popl %edi
+; FALLBACK24-NEXT:    popl %ebx
+; FALLBACK24-NEXT:    popl %ebp
+; FALLBACK24-NEXT:    vzeroupper
+; FALLBACK24-NEXT:    retl
+;
+; FALLBACK25-LABEL: shl_32bytes:
+; FALLBACK25:       # %bb.0:
+; FALLBACK25-NEXT:    pushl %ebp
+; FALLBACK25-NEXT:    pushl %ebx
+; FALLBACK25-NEXT:    pushl %edi
+; FALLBACK25-NEXT:    pushl %esi
+; FALLBACK25-NEXT:    subl $92, %esp
+; FALLBACK25-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; FALLBACK25-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; FALLBACK25-NEXT:    vmovups (%ecx), %ymm0
+; FALLBACK25-NEXT:    movzbl (%eax), %eax
+; FALLBACK25-NEXT:    movl %eax, %ecx
+; FALLBACK25-NEXT:    shlb $3, %cl
+; FALLBACK25-NEXT:    vxorps %xmm1, %xmm1, %xmm1
+; FALLBACK25-NEXT:    vmovups %ymm1, {{[0-9]+}}(%esp)
+; FALLBACK25-NEXT:    vmovups %ymm0, {{[0-9]+}}(%esp)
+; FALLBACK25-NEXT:    andb $28, %al
+; FALLBACK25-NEXT:    negb %al
+; FALLBACK25-NEXT:    movsbl %al, %ebp
+; FALLBACK25-NEXT:    movl 64(%esp,%ebp), %eax
+; FALLBACK25-NEXT:    movl 68(%esp,%ebp), %edx
+; FALLBACK25-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK25-NEXT:    shldl %cl, %eax, %edx
+; FALLBACK25-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK25-NEXT:    movl 60(%esp,%ebp), %edx
+; FALLBACK25-NEXT:    shldl %cl, %edx, %eax
+; FALLBACK25-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK25-NEXT:    movl 56(%esp,%ebp), %edi
+; FALLBACK25-NEXT:    shldl %cl, %edi, %edx
+; FALLBACK25-NEXT:    movl %edx, (%esp) # 4-byte Spill
+; FALLBACK25-NEXT:    movl 52(%esp,%ebp), %ebx
+; FALLBACK25-NEXT:    shldl %cl, %ebx, %edi
+; FALLBACK25-NEXT:    movl 72(%esp,%ebp), %edx
+; FALLBACK25-NEXT:    movl %edx, %eax
+; FALLBACK25-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; FALLBACK25-NEXT:    shldl %cl, %esi, %eax
+; FALLBACK25-NEXT:    movl 48(%esp,%ebp), %esi
+; FALLBACK25-NEXT:    movl 76(%esp,%ebp), %ebp
+; FALLBACK25-NEXT:    shldl %cl, %edx, %ebp
+; FALLBACK25-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; FALLBACK25-NEXT:    movl %ebp, 28(%edx)
+; FALLBACK25-NEXT:    movl %eax, 24(%edx)
+; FALLBACK25-NEXT:    movl %esi, %eax
+; FALLBACK25-NEXT:    shll %cl, %eax
+; FALLBACK25-NEXT:    shldl %cl, %esi, %ebx
+; FALLBACK25-NEXT:    movl %ebx, 4(%edx)
+; FALLBACK25-NEXT:    movl %edi, 8(%edx)
+; FALLBACK25-NEXT:    movl (%esp), %ecx # 4-byte Reload
+; FALLBACK25-NEXT:    movl %ecx, 12(%edx)
+; FALLBACK25-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK25-NEXT:    movl %ecx, 16(%edx)
+; FALLBACK25-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK25-NEXT:    movl %ecx, 20(%edx)
+; FALLBACK25-NEXT:    movl %eax, (%edx)
+; FALLBACK25-NEXT:    addl $92, %esp
+; FALLBACK25-NEXT:    popl %esi
+; FALLBACK25-NEXT:    popl %edi
+; FALLBACK25-NEXT:    popl %ebx
+; FALLBACK25-NEXT:    popl %ebp
+; FALLBACK25-NEXT:    vzeroupper
+; FALLBACK25-NEXT:    retl
+;
+; FALLBACK26-LABEL: shl_32bytes:
+; FALLBACK26:       # %bb.0:
+; FALLBACK26-NEXT:    pushl %ebp
+; FALLBACK26-NEXT:    pushl %ebx
+; FALLBACK26-NEXT:    pushl %edi
+; FALLBACK26-NEXT:    pushl %esi
+; FALLBACK26-NEXT:    subl $108, %esp
+; FALLBACK26-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; FALLBACK26-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; FALLBACK26-NEXT:    vmovups (%ecx), %ymm0
+; FALLBACK26-NEXT:    movzbl (%eax), %ecx
+; FALLBACK26-NEXT:    movl %ecx, %eax
+; FALLBACK26-NEXT:    shlb $3, %al
+; FALLBACK26-NEXT:    vxorps %xmm1, %xmm1, %xmm1
+; FALLBACK26-NEXT:    vmovups %ymm1, {{[0-9]+}}(%esp)
+; FALLBACK26-NEXT:    vmovups %ymm0, {{[0-9]+}}(%esp)
+; FALLBACK26-NEXT:    andb $28, %cl
+; FALLBACK26-NEXT:    negb %cl
+; FALLBACK26-NEXT:    movsbl %cl, %edx
+; FALLBACK26-NEXT:    movl 84(%esp,%edx), %ecx
+; FALLBACK26-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK26-NEXT:    shlxl %eax, %ecx, %ecx
+; FALLBACK26-NEXT:    movl 80(%esp,%edx), %esi
+; FALLBACK26-NEXT:    shlxl %eax, %esi, %edi
+; FALLBACK26-NEXT:    movl %eax, %ebx
+; FALLBACK26-NEXT:    notb %bl
+; FALLBACK26-NEXT:    shrl %esi
+; FALLBACK26-NEXT:    shrxl %ebx, %esi, %esi
+; FALLBACK26-NEXT:    orl %ecx, %esi
+; FALLBACK26-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK26-NEXT:    movl 76(%esp,%edx), %ecx
+; FALLBACK26-NEXT:    movl %ecx, %esi
+; FALLBACK26-NEXT:    shrl %esi
+; FALLBACK26-NEXT:    shrxl %ebx, %esi, %esi
+; FALLBACK26-NEXT:    orl %edi, %esi
+; FALLBACK26-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK26-NEXT:    shlxl %eax, %ecx, %ecx
+; FALLBACK26-NEXT:    movl 72(%esp,%edx), %esi
+; FALLBACK26-NEXT:    movl %esi, %edi
+; FALLBACK26-NEXT:    shrl %edi
+; FALLBACK26-NEXT:    shrxl %ebx, %edi, %edi
+; FALLBACK26-NEXT:    orl %ecx, %edi
+; FALLBACK26-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK26-NEXT:    shlxl %eax, %esi, %ecx
+; FALLBACK26-NEXT:    movl 68(%esp,%edx), %esi
+; FALLBACK26-NEXT:    movl %esi, %edi
+; FALLBACK26-NEXT:    shrl %edi
+; FALLBACK26-NEXT:    shrxl %ebx, %edi, %ebp
+; FALLBACK26-NEXT:    orl %ecx, %ebp
+; FALLBACK26-NEXT:    shlxl %eax, %esi, %edi
+; FALLBACK26-NEXT:    movl 64(%esp,%edx), %esi
+; FALLBACK26-NEXT:    movl %esi, %ecx
+; FALLBACK26-NEXT:    shrl %ecx
+; FALLBACK26-NEXT:    shrxl %ebx, %ecx, %ecx
+; FALLBACK26-NEXT:    orl %edi, %ecx
+; FALLBACK26-NEXT:    shlxl %eax, %esi, %esi
+; FALLBACK26-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK26-NEXT:    shlxl %eax, 92(%esp,%edx), %edi
+; FALLBACK26-NEXT:    movl 88(%esp,%edx), %edx
+; FALLBACK26-NEXT:    shlxl %eax, %edx, %esi
+; FALLBACK26-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK26-NEXT:    shrl %eax
+; FALLBACK26-NEXT:    shrxl %ebx, %eax, %eax
+; FALLBACK26-NEXT:    orl %esi, %eax
+; FALLBACK26-NEXT:    shrl %edx
+; FALLBACK26-NEXT:    shrxl %ebx, %edx, %edx
+; FALLBACK26-NEXT:    orl %edi, %edx
+; FALLBACK26-NEXT:    movl {{[0-9]+}}(%esp), %esi
+; FALLBACK26-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; FALLBACK26-NEXT:    movl %edi, (%esi)
+; FALLBACK26-NEXT:    movl %edx, 28(%esi)
+; FALLBACK26-NEXT:    movl %eax, 24(%esi)
+; FALLBACK26-NEXT:    movl %ecx, 4(%esi)
+; FALLBACK26-NEXT:    movl %ebp, 8(%esi)
+; FALLBACK26-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK26-NEXT:    movl %eax, 12(%esi)
+; FALLBACK26-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK26-NEXT:    movl %eax, 16(%esi)
+; FALLBACK26-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK26-NEXT:    movl %eax, 20(%esi)
+; FALLBACK26-NEXT:    addl $108, %esp
+; FALLBACK26-NEXT:    popl %esi
+; FALLBACK26-NEXT:    popl %edi
+; FALLBACK26-NEXT:    popl %ebx
+; FALLBACK26-NEXT:    popl %ebp
+; FALLBACK26-NEXT:    vzeroupper
+; FALLBACK26-NEXT:    retl
+;
+; FALLBACK27-LABEL: shl_32bytes:
+; FALLBACK27:       # %bb.0:
+; FALLBACK27-NEXT:    pushl %ebp
+; FALLBACK27-NEXT:    pushl %ebx
+; FALLBACK27-NEXT:    pushl %edi
+; FALLBACK27-NEXT:    pushl %esi
+; FALLBACK27-NEXT:    subl $92, %esp
+; FALLBACK27-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; FALLBACK27-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; FALLBACK27-NEXT:    vmovups (%ecx), %ymm0
+; FALLBACK27-NEXT:    movzbl (%eax), %eax
+; FALLBACK27-NEXT:    movl %eax, %ecx
+; FALLBACK27-NEXT:    shlb $3, %cl
+; FALLBACK27-NEXT:    vxorps %xmm1, %xmm1, %xmm1
+; FALLBACK27-NEXT:    vmovups %ymm1, {{[0-9]+}}(%esp)
+; FALLBACK27-NEXT:    vmovups %ymm0, {{[0-9]+}}(%esp)
+; FALLBACK27-NEXT:    andb $28, %al
+; FALLBACK27-NEXT:    negb %al
+; FALLBACK27-NEXT:    movsbl %al, %ebx
+; FALLBACK27-NEXT:    movl 64(%esp,%ebx), %eax
+; FALLBACK27-NEXT:    movl 68(%esp,%ebx), %edx
+; FALLBACK27-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK27-NEXT:    shldl %cl, %eax, %edx
+; FALLBACK27-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK27-NEXT:    movl 60(%esp,%ebx), %edx
+; FALLBACK27-NEXT:    shldl %cl, %edx, %eax
+; FALLBACK27-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK27-NEXT:    movl 56(%esp,%ebx), %edi
+; FALLBACK27-NEXT:    shldl %cl, %edi, %edx
+; FALLBACK27-NEXT:    movl %edx, (%esp) # 4-byte Spill
+; FALLBACK27-NEXT:    movl 52(%esp,%ebx), %ebp
+; FALLBACK27-NEXT:    shldl %cl, %ebp, %edi
+; FALLBACK27-NEXT:    movl 72(%esp,%ebx), %edx
+; FALLBACK27-NEXT:    movl %edx, %eax
+; FALLBACK27-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; FALLBACK27-NEXT:    shldl %cl, %esi, %eax
+; FALLBACK27-NEXT:    movl 48(%esp,%ebx), %esi
+; FALLBACK27-NEXT:    movl 76(%esp,%ebx), %ebx
+; FALLBACK27-NEXT:    shldl %cl, %edx, %ebx
+; FALLBACK27-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; FALLBACK27-NEXT:    movl %ebx, 28(%edx)
+; FALLBACK27-NEXT:    movl %eax, 24(%edx)
+; FALLBACK27-NEXT:    shlxl %ecx, %esi, %eax
+; FALLBACK27-NEXT:    # kill: def $cl killed $cl killed $ecx
+; FALLBACK27-NEXT:    shldl %cl, %esi, %ebp
+; FALLBACK27-NEXT:    movl %ebp, 4(%edx)
+; FALLBACK27-NEXT:    movl %edi, 8(%edx)
+; FALLBACK27-NEXT:    movl (%esp), %ecx # 4-byte Reload
+; FALLBACK27-NEXT:    movl %ecx, 12(%edx)
+; FALLBACK27-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK27-NEXT:    movl %ecx, 16(%edx)
+; FALLBACK27-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK27-NEXT:    movl %ecx, 20(%edx)
+; FALLBACK27-NEXT:    movl %eax, (%edx)
+; FALLBACK27-NEXT:    addl $92, %esp
+; FALLBACK27-NEXT:    popl %esi
+; FALLBACK27-NEXT:    popl %edi
+; FALLBACK27-NEXT:    popl %ebx
+; FALLBACK27-NEXT:    popl %ebp
+; FALLBACK27-NEXT:    vzeroupper
+; FALLBACK27-NEXT:    retl
+;
+; FALLBACK28-LABEL: shl_32bytes:
+; FALLBACK28:       # %bb.0:
+; FALLBACK28-NEXT:    pushl %ebp
+; FALLBACK28-NEXT:    pushl %ebx
+; FALLBACK28-NEXT:    pushl %edi
+; FALLBACK28-NEXT:    pushl %esi
+; FALLBACK28-NEXT:    subl $108, %esp
+; FALLBACK28-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; FALLBACK28-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; FALLBACK28-NEXT:    vmovups (%ecx), %ymm0
+; FALLBACK28-NEXT:    movzbl (%eax), %ecx
+; FALLBACK28-NEXT:    movb %cl, %dh
+; FALLBACK28-NEXT:    shlb $3, %dh
+; FALLBACK28-NEXT:    vxorps %xmm1, %xmm1, %xmm1
+; FALLBACK28-NEXT:    vmovups %ymm1, {{[0-9]+}}(%esp)
+; FALLBACK28-NEXT:    vmovups %ymm0, {{[0-9]+}}(%esp)
+; FALLBACK28-NEXT:    andb $28, %cl
+; FALLBACK28-NEXT:    negb %cl
+; FALLBACK28-NEXT:    movsbl %cl, %eax
+; FALLBACK28-NEXT:    movl 84(%esp,%eax), %edi
+; FALLBACK28-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK28-NEXT:    movb %dh, %cl
+; FALLBACK28-NEXT:    shll %cl, %edi
+; FALLBACK28-NEXT:    movb %dh, %dl
+; FALLBACK28-NEXT:    notb %dl
+; FALLBACK28-NEXT:    movl 80(%esp,%eax), %esi
+; FALLBACK28-NEXT:    movl %eax, %ebx
+; FALLBACK28-NEXT:    movl %esi, %eax
+; FALLBACK28-NEXT:    shrl %eax
+; FALLBACK28-NEXT:    movl %edx, %ecx
+; FALLBACK28-NEXT:    shrl %cl, %eax
+; FALLBACK28-NEXT:    orl %edi, %eax
+; FALLBACK28-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK28-NEXT:    movb %dh, %cl
+; FALLBACK28-NEXT:    shll %cl, %esi
+; FALLBACK28-NEXT:    movl %ebx, %edi
+; FALLBACK28-NEXT:    movl 76(%esp,%ebx), %ebp
+; FALLBACK28-NEXT:    movl %ebp, %eax
+; FALLBACK28-NEXT:    shrl %eax
+; FALLBACK28-NEXT:    movl %edx, %ecx
+; FALLBACK28-NEXT:    shrl %cl, %eax
+; FALLBACK28-NEXT:    orl %esi, %eax
+; FALLBACK28-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK28-NEXT:    movb %dh, %cl
+; FALLBACK28-NEXT:    shll %cl, %ebp
+; FALLBACK28-NEXT:    movl 72(%esp,%ebx), %ebx
+; FALLBACK28-NEXT:    movl %ebx, %eax
+; FALLBACK28-NEXT:    shrl %eax
+; FALLBACK28-NEXT:    movl %edx, %ecx
+; FALLBACK28-NEXT:    shrl %cl, %eax
+; FALLBACK28-NEXT:    orl %ebp, %eax
+; FALLBACK28-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK28-NEXT:    movb %dh, %cl
+; FALLBACK28-NEXT:    shll %cl, %ebx
+; FALLBACK28-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK28-NEXT:    movl 68(%esp,%edi), %ebp
+; FALLBACK28-NEXT:    movl %ebp, %esi
+; FALLBACK28-NEXT:    shrl %esi
+; FALLBACK28-NEXT:    movl %edx, %ecx
+; FALLBACK28-NEXT:    shrl %cl, %esi
+; FALLBACK28-NEXT:    orl %ebx, %esi
+; FALLBACK28-NEXT:    movb %dh, %cl
+; FALLBACK28-NEXT:    shll %cl, %ebp
+; FALLBACK28-NEXT:    movl 64(%esp,%edi), %ebx
+; FALLBACK28-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK28-NEXT:    shrl %ebx
+; FALLBACK28-NEXT:    movl %edx, %ecx
+; FALLBACK28-NEXT:    shrl %cl, %ebx
+; FALLBACK28-NEXT:    orl %ebp, %ebx
+; FALLBACK28-NEXT:    movl 88(%esp,%edi), %ebp
+; FALLBACK28-NEXT:    movl %ebp, %edi
+; FALLBACK28-NEXT:    movb %dh, %cl
+; FALLBACK28-NEXT:    shll %cl, %edi
+; FALLBACK28-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK28-NEXT:    shrl %eax
+; FALLBACK28-NEXT:    movl %edx, %ecx
+; FALLBACK28-NEXT:    shrl %cl, %eax
+; FALLBACK28-NEXT:    orl %edi, %eax
+; FALLBACK28-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK28-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK28-NEXT:    movl 92(%esp,%eax), %edi
+; FALLBACK28-NEXT:    movb %dh, %cl
+; FALLBACK28-NEXT:    shll %cl, %edi
+; FALLBACK28-NEXT:    shrl %ebp
+; FALLBACK28-NEXT:    movl %edx, %ecx
+; FALLBACK28-NEXT:    shrl %cl, %ebp
+; FALLBACK28-NEXT:    orl %edi, %ebp
+; FALLBACK28-NEXT:    movb %dh, %cl
+; FALLBACK28-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; FALLBACK28-NEXT:    shll %cl, %edx
+; FALLBACK28-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; FALLBACK28-NEXT:    movl %edx, (%eax)
+; FALLBACK28-NEXT:    movl %ebp, 28(%eax)
+; FALLBACK28-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK28-NEXT:    movl %ecx, 24(%eax)
+; FALLBACK28-NEXT:    movl %ebx, 4(%eax)
+; FALLBACK28-NEXT:    movl %esi, 8(%eax)
+; FALLBACK28-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK28-NEXT:    movl %ecx, 12(%eax)
+; FALLBACK28-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK28-NEXT:    movl %ecx, 16(%eax)
+; FALLBACK28-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK28-NEXT:    movl %ecx, 20(%eax)
+; FALLBACK28-NEXT:    addl $108, %esp
+; FALLBACK28-NEXT:    popl %esi
+; FALLBACK28-NEXT:    popl %edi
+; FALLBACK28-NEXT:    popl %ebx
+; FALLBACK28-NEXT:    popl %ebp
+; FALLBACK28-NEXT:    vzeroupper
+; FALLBACK28-NEXT:    retl
+;
+; FALLBACK29-LABEL: shl_32bytes:
+; FALLBACK29:       # %bb.0:
+; FALLBACK29-NEXT:    pushl %ebp
+; FALLBACK29-NEXT:    pushl %ebx
+; FALLBACK29-NEXT:    pushl %edi
+; FALLBACK29-NEXT:    pushl %esi
+; FALLBACK29-NEXT:    subl $92, %esp
+; FALLBACK29-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; FALLBACK29-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; FALLBACK29-NEXT:    vmovups (%ecx), %ymm0
+; FALLBACK29-NEXT:    movzbl (%eax), %eax
+; FALLBACK29-NEXT:    movl %eax, %ecx
+; FALLBACK29-NEXT:    shlb $3, %cl
+; FALLBACK29-NEXT:    vxorps %xmm1, %xmm1, %xmm1
+; FALLBACK29-NEXT:    vmovups %ymm1, {{[0-9]+}}(%esp)
+; FALLBACK29-NEXT:    vmovups %ymm0, {{[0-9]+}}(%esp)
+; FALLBACK29-NEXT:    andb $28, %al
+; FALLBACK29-NEXT:    negb %al
+; FALLBACK29-NEXT:    movsbl %al, %ebp
+; FALLBACK29-NEXT:    movl 64(%esp,%ebp), %eax
+; FALLBACK29-NEXT:    movl 68(%esp,%ebp), %edx
+; FALLBACK29-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK29-NEXT:    shldl %cl, %eax, %edx
+; FALLBACK29-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK29-NEXT:    movl 60(%esp,%ebp), %edx
+; FALLBACK29-NEXT:    shldl %cl, %edx, %eax
+; FALLBACK29-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK29-NEXT:    movl 56(%esp,%ebp), %edi
+; FALLBACK29-NEXT:    shldl %cl, %edi, %edx
+; FALLBACK29-NEXT:    movl %edx, (%esp) # 4-byte Spill
+; FALLBACK29-NEXT:    movl 52(%esp,%ebp), %ebx
+; FALLBACK29-NEXT:    shldl %cl, %ebx, %edi
+; FALLBACK29-NEXT:    movl 72(%esp,%ebp), %edx
+; FALLBACK29-NEXT:    movl %edx, %eax
+; FALLBACK29-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; FALLBACK29-NEXT:    shldl %cl, %esi, %eax
+; FALLBACK29-NEXT:    movl 48(%esp,%ebp), %esi
+; FALLBACK29-NEXT:    movl 76(%esp,%ebp), %ebp
+; FALLBACK29-NEXT:    shldl %cl, %edx, %ebp
+; FALLBACK29-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; FALLBACK29-NEXT:    movl %ebp, 28(%edx)
+; FALLBACK29-NEXT:    movl %eax, 24(%edx)
+; FALLBACK29-NEXT:    movl %esi, %eax
+; FALLBACK29-NEXT:    shll %cl, %eax
+; FALLBACK29-NEXT:    shldl %cl, %esi, %ebx
+; FALLBACK29-NEXT:    movl %ebx, 4(%edx)
+; FALLBACK29-NEXT:    movl %edi, 8(%edx)
+; FALLBACK29-NEXT:    movl (%esp), %ecx # 4-byte Reload
+; FALLBACK29-NEXT:    movl %ecx, 12(%edx)
+; FALLBACK29-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK29-NEXT:    movl %ecx, 16(%edx)
+; FALLBACK29-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK29-NEXT:    movl %ecx, 20(%edx)
+; FALLBACK29-NEXT:    movl %eax, (%edx)
+; FALLBACK29-NEXT:    addl $92, %esp
+; FALLBACK29-NEXT:    popl %esi
+; FALLBACK29-NEXT:    popl %edi
+; FALLBACK29-NEXT:    popl %ebx
+; FALLBACK29-NEXT:    popl %ebp
+; FALLBACK29-NEXT:    vzeroupper
+; FALLBACK29-NEXT:    retl
+;
+; FALLBACK30-LABEL: shl_32bytes:
+; FALLBACK30:       # %bb.0:
+; FALLBACK30-NEXT:    pushl %ebp
+; FALLBACK30-NEXT:    pushl %ebx
+; FALLBACK30-NEXT:    pushl %edi
+; FALLBACK30-NEXT:    pushl %esi
+; FALLBACK30-NEXT:    subl $108, %esp
+; FALLBACK30-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; FALLBACK30-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; FALLBACK30-NEXT:    vmovups (%ecx), %ymm0
+; FALLBACK30-NEXT:    movzbl (%eax), %ecx
+; FALLBACK30-NEXT:    movl %ecx, %eax
+; FALLBACK30-NEXT:    shlb $3, %al
+; FALLBACK30-NEXT:    vxorps %xmm1, %xmm1, %xmm1
+; FALLBACK30-NEXT:    vmovups %ymm1, {{[0-9]+}}(%esp)
+; FALLBACK30-NEXT:    vmovups %ymm0, {{[0-9]+}}(%esp)
+; FALLBACK30-NEXT:    andb $28, %cl
+; FALLBACK30-NEXT:    negb %cl
+; FALLBACK30-NEXT:    movsbl %cl, %edx
+; FALLBACK30-NEXT:    movl 84(%esp,%edx), %ecx
+; FALLBACK30-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK30-NEXT:    shlxl %eax, %ecx, %ecx
+; FALLBACK30-NEXT:    movl 80(%esp,%edx), %esi
+; FALLBACK30-NEXT:    shlxl %eax, %esi, %edi
+; FALLBACK30-NEXT:    movl %eax, %ebx
+; FALLBACK30-NEXT:    notb %bl
+; FALLBACK30-NEXT:    shrl %esi
+; FALLBACK30-NEXT:    shrxl %ebx, %esi, %esi
+; FALLBACK30-NEXT:    orl %ecx, %esi
+; FALLBACK30-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK30-NEXT:    movl 76(%esp,%edx), %ecx
+; FALLBACK30-NEXT:    movl %ecx, %esi
+; FALLBACK30-NEXT:    shrl %esi
+; FALLBACK30-NEXT:    shrxl %ebx, %esi, %esi
+; FALLBACK30-NEXT:    orl %edi, %esi
+; FALLBACK30-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK30-NEXT:    shlxl %eax, %ecx, %ecx
+; FALLBACK30-NEXT:    movl 72(%esp,%edx), %esi
+; FALLBACK30-NEXT:    movl %esi, %edi
+; FALLBACK30-NEXT:    shrl %edi
+; FALLBACK30-NEXT:    shrxl %ebx, %edi, %edi
+; FALLBACK30-NEXT:    orl %ecx, %edi
+; FALLBACK30-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK30-NEXT:    shlxl %eax, %esi, %ecx
+; FALLBACK30-NEXT:    movl 68(%esp,%edx), %esi
+; FALLBACK30-NEXT:    movl %esi, %edi
+; FALLBACK30-NEXT:    shrl %edi
+; FALLBACK30-NEXT:    shrxl %ebx, %edi, %ebp
+; FALLBACK30-NEXT:    orl %ecx, %ebp
+; FALLBACK30-NEXT:    shlxl %eax, %esi, %edi
+; FALLBACK30-NEXT:    movl 64(%esp,%edx), %esi
+; FALLBACK30-NEXT:    movl %esi, %ecx
+; FALLBACK30-NEXT:    shrl %ecx
+; FALLBACK30-NEXT:    shrxl %ebx, %ecx, %ecx
+; FALLBACK30-NEXT:    orl %edi, %ecx
+; FALLBACK30-NEXT:    shlxl %eax, %esi, %esi
+; FALLBACK30-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK30-NEXT:    shlxl %eax, 92(%esp,%edx), %edi
+; FALLBACK30-NEXT:    movl 88(%esp,%edx), %edx
+; FALLBACK30-NEXT:    shlxl %eax, %edx, %esi
+; FALLBACK30-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK30-NEXT:    shrl %eax
+; FALLBACK30-NEXT:    shrxl %ebx, %eax, %eax
+; FALLBACK30-NEXT:    orl %esi, %eax
+; FALLBACK30-NEXT:    shrl %edx
+; FALLBACK30-NEXT:    shrxl %ebx, %edx, %edx
+; FALLBACK30-NEXT:    orl %edi, %edx
+; FALLBACK30-NEXT:    movl {{[0-9]+}}(%esp), %esi
+; FALLBACK30-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; FALLBACK30-NEXT:    movl %edi, (%esi)
+; FALLBACK30-NEXT:    movl %edx, 28(%esi)
+; FALLBACK30-NEXT:    movl %eax, 24(%esi)
+; FALLBACK30-NEXT:    movl %ecx, 4(%esi)
+; FALLBACK30-NEXT:    movl %ebp, 8(%esi)
+; FALLBACK30-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK30-NEXT:    movl %eax, 12(%esi)
+; FALLBACK30-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK30-NEXT:    movl %eax, 16(%esi)
+; FALLBACK30-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK30-NEXT:    movl %eax, 20(%esi)
+; FALLBACK30-NEXT:    addl $108, %esp
+; FALLBACK30-NEXT:    popl %esi
+; FALLBACK30-NEXT:    popl %edi
+; FALLBACK30-NEXT:    popl %ebx
+; FALLBACK30-NEXT:    popl %ebp
+; FALLBACK30-NEXT:    vzeroupper
+; FALLBACK30-NEXT:    retl
+;
+; FALLBACK31-LABEL: shl_32bytes:
+; FALLBACK31:       # %bb.0:
+; FALLBACK31-NEXT:    pushl %ebp
+; FALLBACK31-NEXT:    pushl %ebx
+; FALLBACK31-NEXT:    pushl %edi
+; FALLBACK31-NEXT:    pushl %esi
+; FALLBACK31-NEXT:    subl $92, %esp
+; FALLBACK31-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; FALLBACK31-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; FALLBACK31-NEXT:    vmovups (%ecx), %ymm0
+; FALLBACK31-NEXT:    movzbl (%eax), %eax
+; FALLBACK31-NEXT:    movl %eax, %ecx
+; FALLBACK31-NEXT:    shlb $3, %cl
+; FALLBACK31-NEXT:    vxorps %xmm1, %xmm1, %xmm1
+; FALLBACK31-NEXT:    vmovups %ymm1, {{[0-9]+}}(%esp)
+; FALLBACK31-NEXT:    vmovups %ymm0, {{[0-9]+}}(%esp)
+; FALLBACK31-NEXT:    andb $28, %al
+; FALLBACK31-NEXT:    negb %al
+; FALLBACK31-NEXT:    movsbl %al, %ebx
+; FALLBACK31-NEXT:    movl 64(%esp,%ebx), %eax
+; FALLBACK31-NEXT:    movl 68(%esp,%ebx), %edx
+; FALLBACK31-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK31-NEXT:    shldl %cl, %eax, %edx
+; FALLBACK31-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK31-NEXT:    movl 60(%esp,%ebx), %edx
+; FALLBACK31-NEXT:    shldl %cl, %edx, %eax
+; FALLBACK31-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK31-NEXT:    movl 56(%esp,%ebx), %edi
+; FALLBACK31-NEXT:    shldl %cl, %edi, %edx
+; FALLBACK31-NEXT:    movl %edx, (%esp) # 4-byte Spill
+; FALLBACK31-NEXT:    movl 52(%esp,%ebx), %ebp
+; FALLBACK31-NEXT:    shldl %cl, %ebp, %edi
+; FALLBACK31-NEXT:    movl 72(%esp,%ebx), %edx
+; FALLBACK31-NEXT:    movl %edx, %eax
+; FALLBACK31-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; FALLBACK31-NEXT:    shldl %cl, %esi, %eax
+; FALLBACK31-NEXT:    movl 48(%esp,%ebx), %esi
+; FALLBACK31-NEXT:    movl 76(%esp,%ebx), %ebx
+; FALLBACK31-NEXT:    shldl %cl, %edx, %ebx
+; FALLBACK31-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; FALLBACK31-NEXT:    movl %ebx, 28(%edx)
+; FALLBACK31-NEXT:    movl %eax, 24(%edx)
+; FALLBACK31-NEXT:    shlxl %ecx, %esi, %eax
+; FALLBACK31-NEXT:    # kill: def $cl killed $cl killed $ecx
+; FALLBACK31-NEXT:    shldl %cl, %esi, %ebp
+; FALLBACK31-NEXT:    movl %ebp, 4(%edx)
+; FALLBACK31-NEXT:    movl %edi, 8(%edx)
+; FALLBACK31-NEXT:    movl (%esp), %ecx # 4-byte Reload
+; FALLBACK31-NEXT:    movl %ecx, 12(%edx)
+; FALLBACK31-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK31-NEXT:    movl %ecx, 16(%edx)
+; FALLBACK31-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK31-NEXT:    movl %ecx, 20(%edx)
+; FALLBACK31-NEXT:    movl %eax, (%edx)
+; FALLBACK31-NEXT:    addl $92, %esp
+; FALLBACK31-NEXT:    popl %esi
+; FALLBACK31-NEXT:    popl %edi
+; FALLBACK31-NEXT:    popl %ebx
+; FALLBACK31-NEXT:    popl %ebp
+; FALLBACK31-NEXT:    vzeroupper
+; FALLBACK31-NEXT:    retl
+  %src = load i256, ptr %src.ptr, align 1
+  %byteOff = load i256, ptr %byteOff.ptr, align 1
+  %bitOff = shl i256 %byteOff, 3
+  %res = shl i256 %src, %bitOff
+  store i256 %res, ptr %dst, align 1
+  ret void
+}
+
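All of the shl_32bytes bodies above share one scheme: the 32-byte value is stored next to a zeroed slot, the register-aligned part of the offset is folded into the load address (andb $24 plus negb/movsbq on the 64-bit fallbacks, andb $28 on the 32-bit ones), and only the residual sub-register bit count is actually shifted, via shl/shr pairs, shld, or shlx depending on the feature set. A minimal C sketch of that scheme, with a hypothetical name and a 64-bit shifting unit assumed (illustrative only, not the legalizer's code):

    #include <stdint.h>
    #include <string.h>

    /* Illustrative sketch only: left-shift an i256 through a zero-padded
       stack slot, mirroring the pattern the checks above test for. */
    static void shl256_sketch(const uint64_t src[4], unsigned bitOff,
                              uint64_t dst[4]) {
      uint64_t slot[8] = {0};      /* lower half zeroed, like the xorps stores */
      memcpy(&slot[4], src, 32);   /* source sits above the zeros */
      /* Register-aligned part of the offset, cf. andb $24 + negb + movsbq. */
      const uint64_t *p = &slot[4 - ((bitOff / 64) & 3)];
      unsigned rem = bitOff % 64;  /* residual sub-word bit count */
      for (int i = 3; i >= 0; --i)
        dst[i] = (p[i] << rem) | (rem ? p[i - 1] >> (64 - rem) : 0);
    }

The generated code avoids the rem == 0 special case without a branch: either shld/shrd absorbs it directly, or, as in the FALLBACK0 body, the neighbouring word is pre-shifted by one and then shifted by the complemented count (notb), which composes to the same 64 - rem shift.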
+define void @shl_32bytes_dwordOff(ptr %src.ptr, ptr %dwordOff.ptr, ptr %dst) nounwind {
+; FALLBACK0-LABEL: shl_32bytes_dwordOff:
+; FALLBACK0:       # %bb.0:
+; FALLBACK0-NEXT:    pushq %rbx
+; FALLBACK0-NEXT:    movq (%rdi), %rcx
+; FALLBACK0-NEXT:    movq 8(%rdi), %r8
+; FALLBACK0-NEXT:    movq 16(%rdi), %r9
+; FALLBACK0-NEXT:    movq 24(%rdi), %rdi
+; FALLBACK0-NEXT:    movzbl (%rsi), %esi
+; FALLBACK0-NEXT:    movl %esi, %eax
+; FALLBACK0-NEXT:    shlb $5, %al
+; FALLBACK0-NEXT:    xorps %xmm0, %xmm0
+; FALLBACK0-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
+; FALLBACK0-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
+; FALLBACK0-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK0-NEXT:    movq %r9, -{{[0-9]+}}(%rsp)
+; FALLBACK0-NEXT:    movq %r8, -{{[0-9]+}}(%rsp)
+; FALLBACK0-NEXT:    movq %rcx, -{{[0-9]+}}(%rsp)
+; FALLBACK0-NEXT:    shlb $2, %sil
+; FALLBACK0-NEXT:    andb $24, %sil
+; FALLBACK0-NEXT:    negb %sil
+; FALLBACK0-NEXT:    movsbq %sil, %r10
+; FALLBACK0-NEXT:    movq -32(%rsp,%r10), %r8
+; FALLBACK0-NEXT:    movq -24(%rsp,%r10), %rdi
+; FALLBACK0-NEXT:    movq %rdi, %r11
+; FALLBACK0-NEXT:    movl %eax, %ecx
+; FALLBACK0-NEXT:    shlq %cl, %r11
+; FALLBACK0-NEXT:    movl %eax, %esi
+; FALLBACK0-NEXT:    notb %sil
+; FALLBACK0-NEXT:    movq %r8, %r9
+; FALLBACK0-NEXT:    shrq %r9
+; FALLBACK0-NEXT:    movl %esi, %ecx
+; FALLBACK0-NEXT:    shrq %cl, %r9
+; FALLBACK0-NEXT:    orq %r11, %r9
+; FALLBACK0-NEXT:    movq -8(%rsp,%r10), %r11
+; FALLBACK0-NEXT:    movl %eax, %ecx
+; FALLBACK0-NEXT:    shlq %cl, %r11
+; FALLBACK0-NEXT:    movq -16(%rsp,%r10), %r10
+; FALLBACK0-NEXT:    movq %r10, %rbx
+; FALLBACK0-NEXT:    shrq %rbx
+; FALLBACK0-NEXT:    movl %esi, %ecx
+; FALLBACK0-NEXT:    shrq %cl, %rbx
+; FALLBACK0-NEXT:    orq %r11, %rbx
+; FALLBACK0-NEXT:    movl %eax, %ecx
+; FALLBACK0-NEXT:    shlq %cl, %r10
+; FALLBACK0-NEXT:    shrq %rdi
+; FALLBACK0-NEXT:    movl %esi, %ecx
+; FALLBACK0-NEXT:    shrq %cl, %rdi
+; FALLBACK0-NEXT:    orq %r10, %rdi
+; FALLBACK0-NEXT:    movl %eax, %ecx
+; FALLBACK0-NEXT:    shlq %cl, %r8
+; FALLBACK0-NEXT:    movq %r8, (%rdx)
+; FALLBACK0-NEXT:    movq %rdi, 16(%rdx)
+; FALLBACK0-NEXT:    movq %rbx, 24(%rdx)
+; FALLBACK0-NEXT:    movq %r9, 8(%rdx)
+; FALLBACK0-NEXT:    popq %rbx
+; FALLBACK0-NEXT:    retq
+;
+; FALLBACK1-LABEL: shl_32bytes_dwordOff:
+; FALLBACK1:       # %bb.0:
+; FALLBACK1-NEXT:    movq (%rdi), %rax
+; FALLBACK1-NEXT:    movq 8(%rdi), %r8
+; FALLBACK1-NEXT:    movq 16(%rdi), %r9
+; FALLBACK1-NEXT:    movq 24(%rdi), %rdi
+; FALLBACK1-NEXT:    movzbl (%rsi), %esi
+; FALLBACK1-NEXT:    movl %esi, %ecx
+; FALLBACK1-NEXT:    shlb $5, %cl
+; FALLBACK1-NEXT:    xorps %xmm0, %xmm0
+; FALLBACK1-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
+; FALLBACK1-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
+; FALLBACK1-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK1-NEXT:    movq %r9, -{{[0-9]+}}(%rsp)
+; FALLBACK1-NEXT:    movq %r8, -{{[0-9]+}}(%rsp)
+; FALLBACK1-NEXT:    movq %rax, -{{[0-9]+}}(%rsp)
+; FALLBACK1-NEXT:    shlb $2, %sil
+; FALLBACK1-NEXT:    andb $24, %sil
+; FALLBACK1-NEXT:    negb %sil
+; FALLBACK1-NEXT:    movsbq %sil, %rax
+; FALLBACK1-NEXT:    movq -24(%rsp,%rax), %rsi
+; FALLBACK1-NEXT:    movq -16(%rsp,%rax), %rdi
+; FALLBACK1-NEXT:    shldq %cl, %rsi, %rdi
+; FALLBACK1-NEXT:    movq -40(%rsp,%rax), %r8
+; FALLBACK1-NEXT:    movq -32(%rsp,%rax), %rax
+; FALLBACK1-NEXT:    shldq %cl, %rax, %rsi
+; FALLBACK1-NEXT:    shldq %cl, %r8, %rax
+; FALLBACK1-NEXT:    shlq %cl, %r8
+; FALLBACK1-NEXT:    movq %rsi, 16(%rdx)
+; FALLBACK1-NEXT:    movq %rdi, 24(%rdx)
+; FALLBACK1-NEXT:    movq %r8, (%rdx)
+; FALLBACK1-NEXT:    movq %rax, 8(%rdx)
+; FALLBACK1-NEXT:    retq
+;
+; FALLBACK2-LABEL: shl_32bytes_dwordOff:
+; FALLBACK2:       # %bb.0:
+; FALLBACK2-NEXT:    movq (%rdi), %rcx
+; FALLBACK2-NEXT:    movq 8(%rdi), %r8
+; FALLBACK2-NEXT:    movq 16(%rdi), %r9
+; FALLBACK2-NEXT:    movq 24(%rdi), %rdi
+; FALLBACK2-NEXT:    movzbl (%rsi), %esi
+; FALLBACK2-NEXT:    movl %esi, %eax
+; FALLBACK2-NEXT:    shlb $5, %al
+; FALLBACK2-NEXT:    xorps %xmm0, %xmm0
+; FALLBACK2-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
+; FALLBACK2-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
+; FALLBACK2-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK2-NEXT:    movq %r9, -{{[0-9]+}}(%rsp)
+; FALLBACK2-NEXT:    movq %r8, -{{[0-9]+}}(%rsp)
+; FALLBACK2-NEXT:    movq %rcx, -{{[0-9]+}}(%rsp)
+; FALLBACK2-NEXT:    shlb $2, %sil
+; FALLBACK2-NEXT:    andb $24, %sil
+; FALLBACK2-NEXT:    negb %sil
+; FALLBACK2-NEXT:    movsbq %sil, %rsi
+; FALLBACK2-NEXT:    movq -40(%rsp,%rsi), %rdi
+; FALLBACK2-NEXT:    movq -32(%rsp,%rsi), %rcx
+; FALLBACK2-NEXT:    shlxq %rax, %rcx, %r8
+; FALLBACK2-NEXT:    shlxq %rax, -16(%rsp,%rsi), %r9
+; FALLBACK2-NEXT:    movq -24(%rsp,%rsi), %rsi
+; FALLBACK2-NEXT:    shlxq %rax, %rsi, %r10
+; FALLBACK2-NEXT:    shlxq %rax, %rdi, %r11
+; FALLBACK2-NEXT:    # kill: def $al killed $al killed $rax def $rax
+; FALLBACK2-NEXT:    notb %al
+; FALLBACK2-NEXT:    shrq %rdi
+; FALLBACK2-NEXT:    shrxq %rax, %rdi, %rdi
+; FALLBACK2-NEXT:    orq %r8, %rdi
+; FALLBACK2-NEXT:    shrq %rsi
+; FALLBACK2-NEXT:    shrxq %rax, %rsi, %rsi
+; FALLBACK2-NEXT:    orq %r9, %rsi
+; FALLBACK2-NEXT:    shrq %rcx
+; FALLBACK2-NEXT:    shrxq %rax, %rcx, %rax
+; FALLBACK2-NEXT:    orq %r10, %rax
+; FALLBACK2-NEXT:    movq %r11, (%rdx)
+; FALLBACK2-NEXT:    movq %rax, 16(%rdx)
+; FALLBACK2-NEXT:    movq %rsi, 24(%rdx)
+; FALLBACK2-NEXT:    movq %rdi, 8(%rdx)
+; FALLBACK2-NEXT:    retq
+;
+; FALLBACK3-LABEL: shl_32bytes_dwordOff:
+; FALLBACK3:       # %bb.0:
+; FALLBACK3-NEXT:    movq (%rdi), %rax
+; FALLBACK3-NEXT:    movq 8(%rdi), %r8
+; FALLBACK3-NEXT:    movq 16(%rdi), %r9
+; FALLBACK3-NEXT:    movq 24(%rdi), %rdi
+; FALLBACK3-NEXT:    movzbl (%rsi), %esi
+; FALLBACK3-NEXT:    movl %esi, %ecx
+; FALLBACK3-NEXT:    shlb $5, %cl
+; FALLBACK3-NEXT:    xorps %xmm0, %xmm0
+; FALLBACK3-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
+; FALLBACK3-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
+; FALLBACK3-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK3-NEXT:    movq %r9, -{{[0-9]+}}(%rsp)
+; FALLBACK3-NEXT:    movq %r8, -{{[0-9]+}}(%rsp)
+; FALLBACK3-NEXT:    movq %rax, -{{[0-9]+}}(%rsp)
+; FALLBACK3-NEXT:    shlb $2, %sil
+; FALLBACK3-NEXT:    andb $24, %sil
+; FALLBACK3-NEXT:    negb %sil
+; FALLBACK3-NEXT:    movsbq %sil, %rax
+; FALLBACK3-NEXT:    movq -24(%rsp,%rax), %rsi
+; FALLBACK3-NEXT:    movq -16(%rsp,%rax), %rdi
+; FALLBACK3-NEXT:    shldq %cl, %rsi, %rdi
+; FALLBACK3-NEXT:    movq -40(%rsp,%rax), %r8
+; FALLBACK3-NEXT:    movq -32(%rsp,%rax), %rax
+; FALLBACK3-NEXT:    shldq %cl, %rax, %rsi
+; FALLBACK3-NEXT:    shldq %cl, %r8, %rax
+; FALLBACK3-NEXT:    shlxq %rcx, %r8, %rcx
+; FALLBACK3-NEXT:    movq %rsi, 16(%rdx)
+; FALLBACK3-NEXT:    movq %rdi, 24(%rdx)
+; FALLBACK3-NEXT:    movq %rcx, (%rdx)
+; FALLBACK3-NEXT:    movq %rax, 8(%rdx)
+; FALLBACK3-NEXT:    retq
+;
+; FALLBACK4-LABEL: shl_32bytes_dwordOff:
+; FALLBACK4:       # %bb.0:
+; FALLBACK4-NEXT:    movups (%rdi), %xmm0
+; FALLBACK4-NEXT:    movups 16(%rdi), %xmm1
+; FALLBACK4-NEXT:    movzbl (%rsi), %ecx
+; FALLBACK4-NEXT:    movl %ecx, %eax
+; FALLBACK4-NEXT:    shlb $5, %al
+; FALLBACK4-NEXT:    xorps %xmm2, %xmm2
+; FALLBACK4-NEXT:    movaps %xmm2, -{{[0-9]+}}(%rsp)
+; FALLBACK4-NEXT:    movaps %xmm2, -{{[0-9]+}}(%rsp)
+; FALLBACK4-NEXT:    movaps %xmm1, -{{[0-9]+}}(%rsp)
+; FALLBACK4-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
+; FALLBACK4-NEXT:    shlb $2, %cl
+; FALLBACK4-NEXT:    andb $24, %cl
+; FALLBACK4-NEXT:    negb %cl
+; FALLBACK4-NEXT:    movsbq %cl, %r8
+; FALLBACK4-NEXT:    movq -16(%rsp,%r8), %r9
+; FALLBACK4-NEXT:    movl %eax, %ecx
+; FALLBACK4-NEXT:    shlq %cl, %r9
+; FALLBACK4-NEXT:    movl %eax, %esi
+; FALLBACK4-NEXT:    notb %sil
+; FALLBACK4-NEXT:    movq -24(%rsp,%r8), %r10
+; FALLBACK4-NEXT:    movq %r10, %rdi
+; FALLBACK4-NEXT:    shrq %rdi
+; FALLBACK4-NEXT:    movl %esi, %ecx
+; FALLBACK4-NEXT:    shrq %cl, %rdi
+; FALLBACK4-NEXT:    orq %r9, %rdi
+; FALLBACK4-NEXT:    movl %eax, %ecx
+; FALLBACK4-NEXT:    shlq %cl, %r10
+; FALLBACK4-NEXT:    movq -40(%rsp,%r8), %r9
+; FALLBACK4-NEXT:    movq -32(%rsp,%r8), %r8
+; FALLBACK4-NEXT:    movq %r8, %r11
+; FALLBACK4-NEXT:    shrq %r11
+; FALLBACK4-NEXT:    movl %esi, %ecx
+; FALLBACK4-NEXT:    shrq %cl, %r11
+; FALLBACK4-NEXT:    orq %r10, %r11
+; FALLBACK4-NEXT:    movl %eax, %ecx
+; FALLBACK4-NEXT:    shlq %cl, %r8
+; FALLBACK4-NEXT:    movq %r9, %r10
+; FALLBACK4-NEXT:    shrq %r10
+; FALLBACK4-NEXT:    movl %esi, %ecx
+; FALLBACK4-NEXT:    shrq %cl, %r10
+; FALLBACK4-NEXT:    orq %r8, %r10
+; FALLBACK4-NEXT:    movl %eax, %ecx
+; FALLBACK4-NEXT:    shlq %cl, %r9
+; FALLBACK4-NEXT:    movq %r9, (%rdx)
+; FALLBACK4-NEXT:    movq %r10, 8(%rdx)
+; FALLBACK4-NEXT:    movq %r11, 16(%rdx)
+; FALLBACK4-NEXT:    movq %rdi, 24(%rdx)
+; FALLBACK4-NEXT:    retq
+;
+; FALLBACK5-LABEL: shl_32bytes_dwordOff:
+; FALLBACK5:       # %bb.0:
+; FALLBACK5-NEXT:    movups (%rdi), %xmm0
+; FALLBACK5-NEXT:    movups 16(%rdi), %xmm1
+; FALLBACK5-NEXT:    movzbl (%rsi), %eax
+; FALLBACK5-NEXT:    movl %eax, %ecx
+; FALLBACK5-NEXT:    shlb $5, %cl
+; FALLBACK5-NEXT:    xorps %xmm2, %xmm2
+; FALLBACK5-NEXT:    movaps %xmm2, -{{[0-9]+}}(%rsp)
+; FALLBACK5-NEXT:    movaps %xmm2, -{{[0-9]+}}(%rsp)
+; FALLBACK5-NEXT:    movaps %xmm1, -{{[0-9]+}}(%rsp)
+; FALLBACK5-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
+; FALLBACK5-NEXT:    shlb $2, %al
+; FALLBACK5-NEXT:    andb $24, %al
+; FALLBACK5-NEXT:    negb %al
+; FALLBACK5-NEXT:    movsbq %al, %rax
+; FALLBACK5-NEXT:    movq -24(%rsp,%rax), %rsi
+; FALLBACK5-NEXT:    movq -16(%rsp,%rax), %rdi
+; FALLBACK5-NEXT:    shldq %cl, %rsi, %rdi
+; FALLBACK5-NEXT:    movq -40(%rsp,%rax), %r8
+; FALLBACK5-NEXT:    movq -32(%rsp,%rax), %rax
+; FALLBACK5-NEXT:    shldq %cl, %rax, %rsi
+; FALLBACK5-NEXT:    movq %r8, %r9
+; FALLBACK5-NEXT:    shlq %cl, %r9
+; FALLBACK5-NEXT:    shldq %cl, %r8, %rax
+; FALLBACK5-NEXT:    movq %rax, 8(%rdx)
+; FALLBACK5-NEXT:    movq %rsi, 16(%rdx)
+; FALLBACK5-NEXT:    movq %rdi, 24(%rdx)
+; FALLBACK5-NEXT:    movq %r9, (%rdx)
+; FALLBACK5-NEXT:    retq
+;
+; FALLBACK6-LABEL: shl_32bytes_dwordOff:
+; FALLBACK6:       # %bb.0:
+; FALLBACK6-NEXT:    movups (%rdi), %xmm0
+; FALLBACK6-NEXT:    movups 16(%rdi), %xmm1
+; FALLBACK6-NEXT:    movzbl (%rsi), %ecx
+; FALLBACK6-NEXT:    movl %ecx, %eax
+; FALLBACK6-NEXT:    shlb $5, %al
+; FALLBACK6-NEXT:    xorps %xmm2, %xmm2
+; FALLBACK6-NEXT:    movaps %xmm2, -{{[0-9]+}}(%rsp)
+; FALLBACK6-NEXT:    movaps %xmm2, -{{[0-9]+}}(%rsp)
+; FALLBACK6-NEXT:    movaps %xmm1, -{{[0-9]+}}(%rsp)
+; FALLBACK6-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
+; FALLBACK6-NEXT:    shlb $2, %cl
+; FALLBACK6-NEXT:    andb $24, %cl
+; FALLBACK6-NEXT:    negb %cl
+; FALLBACK6-NEXT:    movsbq %cl, %rcx
+; FALLBACK6-NEXT:    shlxq %rax, -16(%rsp,%rcx), %rsi
+; FALLBACK6-NEXT:    movq -24(%rsp,%rcx), %rdi
+; FALLBACK6-NEXT:    shlxq %rax, %rdi, %r8
+; FALLBACK6-NEXT:    movq -40(%rsp,%rcx), %r9
+; FALLBACK6-NEXT:    movq -32(%rsp,%rcx), %rcx
+; FALLBACK6-NEXT:    shlxq %rax, %rcx, %r10
+; FALLBACK6-NEXT:    shlxq %rax, %r9, %r11
+; FALLBACK6-NEXT:    # kill: def $al killed $al killed $rax def $rax
+; FALLBACK6-NEXT:    notb %al
+; FALLBACK6-NEXT:    shrq %rdi
+; FALLBACK6-NEXT:    shrxq %rax, %rdi, %rdi
+; FALLBACK6-NEXT:    orq %rsi, %rdi
+; FALLBACK6-NEXT:    shrq %rcx
+; FALLBACK6-NEXT:    shrxq %rax, %rcx, %rcx
+; FALLBACK6-NEXT:    orq %r8, %rcx
+; FALLBACK6-NEXT:    shrq %r9
+; FALLBACK6-NEXT:    shrxq %rax, %r9, %rax
+; FALLBACK6-NEXT:    orq %r10, %rax
+; FALLBACK6-NEXT:    movq %r11, (%rdx)
+; FALLBACK6-NEXT:    movq %rax, 8(%rdx)
+; FALLBACK6-NEXT:    movq %rcx, 16(%rdx)
+; FALLBACK6-NEXT:    movq %rdi, 24(%rdx)
+; FALLBACK6-NEXT:    retq
+;
+; FALLBACK7-LABEL: shl_32bytes_dwordOff:
+; FALLBACK7:       # %bb.0:
+; FALLBACK7-NEXT:    movups (%rdi), %xmm0
+; FALLBACK7-NEXT:    movups 16(%rdi), %xmm1
+; FALLBACK7-NEXT:    movzbl (%rsi), %eax
+; FALLBACK7-NEXT:    movl %eax, %ecx
+; FALLBACK7-NEXT:    shlb $5, %cl
+; FALLBACK7-NEXT:    xorps %xmm2, %xmm2
+; FALLBACK7-NEXT:    movaps %xmm2, -{{[0-9]+}}(%rsp)
+; FALLBACK7-NEXT:    movaps %xmm2, -{{[0-9]+}}(%rsp)
+; FALLBACK7-NEXT:    movaps %xmm1, -{{[0-9]+}}(%rsp)
+; FALLBACK7-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
+; FALLBACK7-NEXT:    shlb $2, %al
+; FALLBACK7-NEXT:    andb $24, %al
+; FALLBACK7-NEXT:    negb %al
+; FALLBACK7-NEXT:    movsbq %al, %rax
+; FALLBACK7-NEXT:    movq -24(%rsp,%rax), %rsi
+; FALLBACK7-NEXT:    movq -16(%rsp,%rax), %rdi
+; FALLBACK7-NEXT:    shldq %cl, %rsi, %rdi
+; FALLBACK7-NEXT:    movq -40(%rsp,%rax), %r8
+; FALLBACK7-NEXT:    movq -32(%rsp,%rax), %rax
+; FALLBACK7-NEXT:    shldq %cl, %rax, %rsi
+; FALLBACK7-NEXT:    shlxq %rcx, %r8, %r9
+; FALLBACK7-NEXT:    # kill: def $cl killed $cl killed $rcx
+; FALLBACK7-NEXT:    shldq %cl, %r8, %rax
+; FALLBACK7-NEXT:    movq %rax, 8(%rdx)
+; FALLBACK7-NEXT:    movq %rsi, 16(%rdx)
+; FALLBACK7-NEXT:    movq %rdi, 24(%rdx)
+; FALLBACK7-NEXT:    movq %r9, (%rdx)
+; FALLBACK7-NEXT:    retq
+;
+; FALLBACK8-LABEL: shl_32bytes_dwordOff:
+; FALLBACK8:       # %bb.0:
+; FALLBACK8-NEXT:    vmovups (%rdi), %ymm0
+; FALLBACK8-NEXT:    movzbl (%rsi), %ecx
+; FALLBACK8-NEXT:    movl %ecx, %eax
+; FALLBACK8-NEXT:    shlb $5, %al
+; FALLBACK8-NEXT:    vxorps %xmm1, %xmm1, %xmm1
+; FALLBACK8-NEXT:    vmovups %ymm1, -{{[0-9]+}}(%rsp)
+; FALLBACK8-NEXT:    vmovups %ymm0, -{{[0-9]+}}(%rsp)
+; FALLBACK8-NEXT:    shlb $2, %cl
+; FALLBACK8-NEXT:    andb $24, %cl
+; FALLBACK8-NEXT:    negb %cl
+; FALLBACK8-NEXT:    movsbq %cl, %r8
+; FALLBACK8-NEXT:    movq -16(%rsp,%r8), %r9
+; FALLBACK8-NEXT:    movl %eax, %ecx
+; FALLBACK8-NEXT:    shlq %cl, %r9
+; FALLBACK8-NEXT:    movl %eax, %esi
+; FALLBACK8-NEXT:    notb %sil
+; FALLBACK8-NEXT:    movq -24(%rsp,%r8), %r10
+; FALLBACK8-NEXT:    movq %r10, %rdi
+; FALLBACK8-NEXT:    shrq %rdi
+; FALLBACK8-NEXT:    movl %esi, %ecx
+; FALLBACK8-NEXT:    shrq %cl, %rdi
+; FALLBACK8-NEXT:    orq %r9, %rdi
+; FALLBACK8-NEXT:    movl %eax, %ecx
+; FALLBACK8-NEXT:    shlq %cl, %r10
+; FALLBACK8-NEXT:    movq -40(%rsp,%r8), %r9
+; FALLBACK8-NEXT:    movq -32(%rsp,%r8), %r8
+; FALLBACK8-NEXT:    movq %r8, %r11
+; FALLBACK8-NEXT:    shrq %r11
+; FALLBACK8-NEXT:    movl %esi, %ecx
+; FALLBACK8-NEXT:    shrq %cl, %r11
+; FALLBACK8-NEXT:    orq %r10, %r11
+; FALLBACK8-NEXT:    movl %eax, %ecx
+; FALLBACK8-NEXT:    shlq %cl, %r8
+; FALLBACK8-NEXT:    movq %r9, %r10
+; FALLBACK8-NEXT:    shrq %r10
+; FALLBACK8-NEXT:    movl %esi, %ecx
+; FALLBACK8-NEXT:    shrq %cl, %r10
+; FALLBACK8-NEXT:    orq %r8, %r10
+; FALLBACK8-NEXT:    movl %eax, %ecx
+; FALLBACK8-NEXT:    shlq %cl, %r9
+; FALLBACK8-NEXT:    movq %r9, (%rdx)
+; FALLBACK8-NEXT:    movq %r10, 8(%rdx)
+; FALLBACK8-NEXT:    movq %r11, 16(%rdx)
+; FALLBACK8-NEXT:    movq %rdi, 24(%rdx)
+; FALLBACK8-NEXT:    vzeroupper
+; FALLBACK8-NEXT:    retq
+;
+; FALLBACK9-LABEL: shl_32bytes_dwordOff:
+; FALLBACK9:       # %bb.0:
+; FALLBACK9-NEXT:    vmovups (%rdi), %ymm0
+; FALLBACK9-NEXT:    movzbl (%rsi), %eax
+; FALLBACK9-NEXT:    movl %eax, %ecx
+; FALLBACK9-NEXT:    shlb $5, %cl
+; FALLBACK9-NEXT:    vxorps %xmm1, %xmm1, %xmm1
+; FALLBACK9-NEXT:    vmovups %ymm1, -{{[0-9]+}}(%rsp)
+; FALLBACK9-NEXT:    vmovups %ymm0, -{{[0-9]+}}(%rsp)
+; FALLBACK9-NEXT:    shlb $2, %al
+; FALLBACK9-NEXT:    andb $24, %al
+; FALLBACK9-NEXT:    negb %al
+; FALLBACK9-NEXT:    movsbq %al, %rax
+; FALLBACK9-NEXT:    movq -24(%rsp,%rax), %rsi
+; FALLBACK9-NEXT:    movq -16(%rsp,%rax), %rdi
+; FALLBACK9-NEXT:    shldq %cl, %rsi, %rdi
+; FALLBACK9-NEXT:    movq -40(%rsp,%rax), %r8
+; FALLBACK9-NEXT:    movq -32(%rsp,%rax), %rax
+; FALLBACK9-NEXT:    shldq %cl, %rax, %rsi
+; FALLBACK9-NEXT:    movq %r8, %r9
+; FALLBACK9-NEXT:    shlq %cl, %r9
+; FALLBACK9-NEXT:    shldq %cl, %r8, %rax
+; FALLBACK9-NEXT:    movq %rax, 8(%rdx)
+; FALLBACK9-NEXT:    movq %rsi, 16(%rdx)
+; FALLBACK9-NEXT:    movq %rdi, 24(%rdx)
+; FALLBACK9-NEXT:    movq %r9, (%rdx)
+; FALLBACK9-NEXT:    vzeroupper
+; FALLBACK9-NEXT:    retq
+;
+; FALLBACK10-LABEL: shl_32bytes_dwordOff:
+; FALLBACK10:       # %bb.0:
+; FALLBACK10-NEXT:    vmovups (%rdi), %ymm0
+; FALLBACK10-NEXT:    movzbl (%rsi), %ecx
+; FALLBACK10-NEXT:    movl %ecx, %eax
+; FALLBACK10-NEXT:    shlb $5, %al
+; FALLBACK10-NEXT:    vxorps %xmm1, %xmm1, %xmm1
+; FALLBACK10-NEXT:    vmovups %ymm1, -{{[0-9]+}}(%rsp)
+; FALLBACK10-NEXT:    vmovups %ymm0, -{{[0-9]+}}(%rsp)
+; FALLBACK10-NEXT:    shlb $2, %cl
+; FALLBACK10-NEXT:    andb $24, %cl
+; FALLBACK10-NEXT:    negb %cl
+; FALLBACK10-NEXT:    movsbq %cl, %rcx
+; FALLBACK10-NEXT:    shlxq %rax, -16(%rsp,%rcx), %rsi
+; FALLBACK10-NEXT:    movq -24(%rsp,%rcx), %rdi
+; FALLBACK10-NEXT:    shlxq %rax, %rdi, %r8
+; FALLBACK10-NEXT:    movq -40(%rsp,%rcx), %r9
+; FALLBACK10-NEXT:    movq -32(%rsp,%rcx), %rcx
+; FALLBACK10-NEXT:    shlxq %rax, %rcx, %r10
+; FALLBACK10-NEXT:    shlxq %rax, %r9, %r11
+; FALLBACK10-NEXT:    # kill: def $al killed $al killed $rax def $rax
+; FALLBACK10-NEXT:    notb %al
+; FALLBACK10-NEXT:    shrq %rdi
+; FALLBACK10-NEXT:    shrxq %rax, %rdi, %rdi
+; FALLBACK10-NEXT:    orq %rsi, %rdi
+; FALLBACK10-NEXT:    shrq %rcx
+; FALLBACK10-NEXT:    shrxq %rax, %rcx, %rcx
+; FALLBACK10-NEXT:    orq %r8, %rcx
+; FALLBACK10-NEXT:    shrq %r9
+; FALLBACK10-NEXT:    shrxq %rax, %r9, %rax
+; FALLBACK10-NEXT:    orq %r10, %rax
+; FALLBACK10-NEXT:    movq %r11, (%rdx)
+; FALLBACK10-NEXT:    movq %rax, 8(%rdx)
+; FALLBACK10-NEXT:    movq %rcx, 16(%rdx)
+; FALLBACK10-NEXT:    movq %rdi, 24(%rdx)
+; FALLBACK10-NEXT:    vzeroupper
+; FALLBACK10-NEXT:    retq
+;
+; FALLBACK11-LABEL: shl_32bytes_dwordOff:
+; FALLBACK11:       # %bb.0:
+; FALLBACK11-NEXT:    vmovups (%rdi), %ymm0
+; FALLBACK11-NEXT:    movzbl (%rsi), %eax
+; FALLBACK11-NEXT:    movl %eax, %ecx
+; FALLBACK11-NEXT:    shlb $5, %cl
+; FALLBACK11-NEXT:    vxorps %xmm1, %xmm1, %xmm1
+; FALLBACK11-NEXT:    vmovups %ymm1, -{{[0-9]+}}(%rsp)
+; FALLBACK11-NEXT:    vmovups %ymm0, -{{[0-9]+}}(%rsp)
+; FALLBACK11-NEXT:    shlb $2, %al
+; FALLBACK11-NEXT:    andb $24, %al
+; FALLBACK11-NEXT:    negb %al
+; FALLBACK11-NEXT:    movsbq %al, %rax
+; FALLBACK11-NEXT:    movq -24(%rsp,%rax), %rsi
+; FALLBACK11-NEXT:    movq -16(%rsp,%rax), %rdi
+; FALLBACK11-NEXT:    shldq %cl, %rsi, %rdi
+; FALLBACK11-NEXT:    movq -40(%rsp,%rax), %r8
+; FALLBACK11-NEXT:    movq -32(%rsp,%rax), %rax
+; FALLBACK11-NEXT:    shldq %cl, %rax, %rsi
+; FALLBACK11-NEXT:    shlxq %rcx, %r8, %r9
+; FALLBACK11-NEXT:    # kill: def $cl killed $cl killed $rcx
+; FALLBACK11-NEXT:    shldq %cl, %r8, %rax
+; FALLBACK11-NEXT:    movq %rax, 8(%rdx)
+; FALLBACK11-NEXT:    movq %rsi, 16(%rdx)
+; FALLBACK11-NEXT:    movq %rdi, 24(%rdx)
+; FALLBACK11-NEXT:    movq %r9, (%rdx)
+; FALLBACK11-NEXT:    vzeroupper
+; FALLBACK11-NEXT:    retq
+;
+; FALLBACK12-LABEL: shl_32bytes_dwordOff:
+; FALLBACK12:       # %bb.0:
+; FALLBACK12-NEXT:    vmovups (%rdi), %ymm0
+; FALLBACK12-NEXT:    movzbl (%rsi), %ecx
+; FALLBACK12-NEXT:    movl %ecx, %eax
+; FALLBACK12-NEXT:    shlb $5, %al
+; FALLBACK12-NEXT:    vxorps %xmm1, %xmm1, %xmm1
+; FALLBACK12-NEXT:    vmovups %ymm1, -{{[0-9]+}}(%rsp)
+; FALLBACK12-NEXT:    vmovups %ymm0, -{{[0-9]+}}(%rsp)
+; FALLBACK12-NEXT:    shlb $2, %cl
+; FALLBACK12-NEXT:    andb $24, %cl
+; FALLBACK12-NEXT:    negb %cl
+; FALLBACK12-NEXT:    movsbq %cl, %r8
+; FALLBACK12-NEXT:    movq -16(%rsp,%r8), %r9
+; FALLBACK12-NEXT:    movl %eax, %ecx
+; FALLBACK12-NEXT:    shlq %cl, %r9
+; FALLBACK12-NEXT:    movl %eax, %esi
+; FALLBACK12-NEXT:    notb %sil
+; FALLBACK12-NEXT:    movq -24(%rsp,%r8), %r10
+; FALLBACK12-NEXT:    movq %r10, %rdi
+; FALLBACK12-NEXT:    shrq %rdi
+; FALLBACK12-NEXT:    movl %esi, %ecx
+; FALLBACK12-NEXT:    shrq %cl, %rdi
+; FALLBACK12-NEXT:    orq %r9, %rdi
+; FALLBACK12-NEXT:    movl %eax, %ecx
+; FALLBACK12-NEXT:    shlq %cl, %r10
+; FALLBACK12-NEXT:    movq -40(%rsp,%r8), %r9
+; FALLBACK12-NEXT:    movq -32(%rsp,%r8), %r8
+; FALLBACK12-NEXT:    movq %r8, %r11
+; FALLBACK12-NEXT:    shrq %r11
+; FALLBACK12-NEXT:    movl %esi, %ecx
+; FALLBACK12-NEXT:    shrq %cl, %r11
+; FALLBACK12-NEXT:    orq %r10, %r11
+; FALLBACK12-NEXT:    movl %eax, %ecx
+; FALLBACK12-NEXT:    shlq %cl, %r8
+; FALLBACK12-NEXT:    movq %r9, %r10
+; FALLBACK12-NEXT:    shrq %r10
+; FALLBACK12-NEXT:    movl %esi, %ecx
+; FALLBACK12-NEXT:    shrq %cl, %r10
+; FALLBACK12-NEXT:    orq %r8, %r10
+; FALLBACK12-NEXT:    movl %eax, %ecx
+; FALLBACK12-NEXT:    shlq %cl, %r9
+; FALLBACK12-NEXT:    movq %r9, (%rdx)
+; FALLBACK12-NEXT:    movq %r10, 8(%rdx)
+; FALLBACK12-NEXT:    movq %r11, 16(%rdx)
+; FALLBACK12-NEXT:    movq %rdi, 24(%rdx)
+; FALLBACK12-NEXT:    vzeroupper
+; FALLBACK12-NEXT:    retq
+;
+; FALLBACK13-LABEL: shl_32bytes_dwordOff:
+; FALLBACK13:       # %bb.0:
+; FALLBACK13-NEXT:    vmovups (%rdi), %ymm0
+; FALLBACK13-NEXT:    movzbl (%rsi), %eax
+; FALLBACK13-NEXT:    movl %eax, %ecx
+; FALLBACK13-NEXT:    shlb $5, %cl
+; FALLBACK13-NEXT:    vxorps %xmm1, %xmm1, %xmm1
+; FALLBACK13-NEXT:    vmovups %ymm1, -{{[0-9]+}}(%rsp)
+; FALLBACK13-NEXT:    vmovups %ymm0, -{{[0-9]+}}(%rsp)
+; FALLBACK13-NEXT:    shlb $2, %al
+; FALLBACK13-NEXT:    andb $24, %al
+; FALLBACK13-NEXT:    negb %al
+; FALLBACK13-NEXT:    movsbq %al, %rax
+; FALLBACK13-NEXT:    movq -24(%rsp,%rax), %rsi
+; FALLBACK13-NEXT:    movq -16(%rsp,%rax), %rdi
+; FALLBACK13-NEXT:    shldq %cl, %rsi, %rdi
+; FALLBACK13-NEXT:    movq -40(%rsp,%rax), %r8
+; FALLBACK13-NEXT:    movq -32(%rsp,%rax), %rax
+; FALLBACK13-NEXT:    shldq %cl, %rax, %rsi
+; FALLBACK13-NEXT:    movq %r8, %r9
+; FALLBACK13-NEXT:    shlq %cl, %r9
+; FALLBACK13-NEXT:    shldq %cl, %r8, %rax
+; FALLBACK13-NEXT:    movq %rax, 8(%rdx)
+; FALLBACK13-NEXT:    movq %rsi, 16(%rdx)
+; FALLBACK13-NEXT:    movq %rdi, 24(%rdx)
+; FALLBACK13-NEXT:    movq %r9, (%rdx)
+; FALLBACK13-NEXT:    vzeroupper
+; FALLBACK13-NEXT:    retq
+;
+; FALLBACK14-LABEL: shl_32bytes_dwordOff:
+; FALLBACK14:       # %bb.0:
+; FALLBACK14-NEXT:    vmovups (%rdi), %ymm0
+; FALLBACK14-NEXT:    movzbl (%rsi), %ecx
+; FALLBACK14-NEXT:    movl %ecx, %eax
+; FALLBACK14-NEXT:    shlb $5, %al
+; FALLBACK14-NEXT:    vxorps %xmm1, %xmm1, %xmm1
+; FALLBACK14-NEXT:    vmovups %ymm1, -{{[0-9]+}}(%rsp)
+; FALLBACK14-NEXT:    vmovups %ymm0, -{{[0-9]+}}(%rsp)
+; FALLBACK14-NEXT:    shlb $2, %cl
+; FALLBACK14-NEXT:    andb $24, %cl
+; FALLBACK14-NEXT:    negb %cl
+; FALLBACK14-NEXT:    movsbq %cl, %rcx
+; FALLBACK14-NEXT:    shlxq %rax, -16(%rsp,%rcx), %rsi
+; FALLBACK14-NEXT:    movq -24(%rsp,%rcx), %rdi
+; FALLBACK14-NEXT:    shlxq %rax, %rdi, %r8
+; FALLBACK14-NEXT:    movq -40(%rsp,%rcx), %r9
+; FALLBACK14-NEXT:    movq -32(%rsp,%rcx), %rcx
+; FALLBACK14-NEXT:    shlxq %rax, %rcx, %r10
+; FALLBACK14-NEXT:    shlxq %rax, %r9, %r11
+; FALLBACK14-NEXT:    # kill: def $al killed $al killed $rax def $rax
+; FALLBACK14-NEXT:    notb %al
+; FALLBACK14-NEXT:    shrq %rdi
+; FALLBACK14-NEXT:    shrxq %rax, %rdi, %rdi
+; FALLBACK14-NEXT:    orq %rsi, %rdi
+; FALLBACK14-NEXT:    shrq %rcx
+; FALLBACK14-NEXT:    shrxq %rax, %rcx, %rcx
+; FALLBACK14-NEXT:    orq %r8, %rcx
+; FALLBACK14-NEXT:    shrq %r9
+; FALLBACK14-NEXT:    shrxq %rax, %r9, %rax
+; FALLBACK14-NEXT:    orq %r10, %rax
+; FALLBACK14-NEXT:    movq %r11, (%rdx)
+; FALLBACK14-NEXT:    movq %rax, 8(%rdx)
+; FALLBACK14-NEXT:    movq %rcx, 16(%rdx)
+; FALLBACK14-NEXT:    movq %rdi, 24(%rdx)
+; FALLBACK14-NEXT:    vzeroupper
+; FALLBACK14-NEXT:    retq
+;
+; FALLBACK15-LABEL: shl_32bytes_dwordOff:
+; FALLBACK15:       # %bb.0:
+; FALLBACK15-NEXT:    vmovups (%rdi), %ymm0
+; FALLBACK15-NEXT:    movzbl (%rsi), %eax
+; FALLBACK15-NEXT:    movl %eax, %ecx
+; FALLBACK15-NEXT:    shlb $5, %cl
+; FALLBACK15-NEXT:    vxorps %xmm1, %xmm1, %xmm1
+; FALLBACK15-NEXT:    vmovups %ymm1, -{{[0-9]+}}(%rsp)
+; FALLBACK15-NEXT:    vmovups %ymm0, -{{[0-9]+}}(%rsp)
+; FALLBACK15-NEXT:    shlb $2, %al
+; FALLBACK15-NEXT:    andb $24, %al
+; FALLBACK15-NEXT:    negb %al
+; FALLBACK15-NEXT:    movsbq %al, %rax
+; FALLBACK15-NEXT:    movq -24(%rsp,%rax), %rsi
+; FALLBACK15-NEXT:    movq -16(%rsp,%rax), %rdi
+; FALLBACK15-NEXT:    shldq %cl, %rsi, %rdi
+; FALLBACK15-NEXT:    movq -40(%rsp,%rax), %r8
+; FALLBACK15-NEXT:    movq -32(%rsp,%rax), %rax
+; FALLBACK15-NEXT:    shldq %cl, %rax, %rsi
+; FALLBACK15-NEXT:    shlxq %rcx, %r8, %r9
+; FALLBACK15-NEXT:    # kill: def $cl killed $cl killed $rcx
+; FALLBACK15-NEXT:    shldq %cl, %r8, %rax
+; FALLBACK15-NEXT:    movq %rax, 8(%rdx)
+; FALLBACK15-NEXT:    movq %rsi, 16(%rdx)
+; FALLBACK15-NEXT:    movq %rdi, 24(%rdx)
+; FALLBACK15-NEXT:    movq %r9, (%rdx)
+; FALLBACK15-NEXT:    vzeroupper
+; FALLBACK15-NEXT:    retq
+;
+; X86-SSE2-LABEL: shl_32bytes_dwordOff:
+; X86-SSE2:       # %bb.0:
+; X86-SSE2-NEXT:    pushl %ebp
+; X86-SSE2-NEXT:    pushl %ebx
+; X86-SSE2-NEXT:    pushl %edi
+; X86-SSE2-NEXT:    pushl %esi
+; X86-SSE2-NEXT:    subl $92, %esp
+; X86-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %ebp
+; X86-SSE2-NEXT:    movl (%ebp), %eax
 ; X86-SSE2-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-SSE2-NEXT:    movl 44(%esp,%edx), %eax
-; X86-SSE2-NEXT:    movl %eax, (%esp) # 4-byte Spill
-; X86-SSE2-NEXT:    movl 52(%esp,%edx), %esi
-; X86-SSE2-NEXT:    movl 48(%esp,%edx), %edi
-; X86-SSE2-NEXT:    movl 60(%esp,%edx), %ebx
-; X86-SSE2-NEXT:    movl 56(%esp,%edx), %ebp
-; X86-SSE2-NEXT:    movl 68(%esp,%edx), %ecx
-; X86-SSE2-NEXT:    movl 64(%esp,%edx), %edx
+; X86-SSE2-NEXT:    movl 4(%ebp), %eax
+; X86-SSE2-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-SSE2-NEXT:    movl 8(%ebp), %esi
+; X86-SSE2-NEXT:    movl 12(%ebp), %edi
+; X86-SSE2-NEXT:    movl 16(%ebp), %ebx
+; X86-SSE2-NEXT:    movzbl (%ecx), %ecx
+; X86-SSE2-NEXT:    movl 20(%ebp), %edx
+; X86-SSE2-NEXT:    movl 24(%ebp), %eax
+; X86-SSE2-NEXT:    movl 28(%ebp), %ebp
+; X86-SSE2-NEXT:    movl %ebp, {{[0-9]+}}(%esp)
+; X86-SSE2-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-SSE2-NEXT:    xorps %xmm0, %xmm0
+; X86-SSE2-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
+; X86-SSE2-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
+; X86-SSE2-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; X86-SSE2-NEXT:    movl %ebx, {{[0-9]+}}(%esp)
+; X86-SSE2-NEXT:    movl %edi, {{[0-9]+}}(%esp)
+; X86-SSE2-NEXT:    movl %esi, {{[0-9]+}}(%esp)
+; X86-SSE2-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-SSE2-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-SSE2-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-SSE2-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-SSE2-NEXT:    shlb $2, %cl
+; X86-SSE2-NEXT:    andb $28, %cl
+; X86-SSE2-NEXT:    negb %cl
+; X86-SSE2-NEXT:    movsbl %cl, %edx
+; X86-SSE2-NEXT:    movl 48(%esp,%edx), %eax
+; X86-SSE2-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-SSE2-NEXT:    movl 52(%esp,%edx), %eax
+; X86-SSE2-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-SSE2-NEXT:    movl 60(%esp,%edx), %esi
+; X86-SSE2-NEXT:    movl 56(%esp,%edx), %edi
+; X86-SSE2-NEXT:    movl 68(%esp,%edx), %ebx
+; X86-SSE2-NEXT:    movl 64(%esp,%edx), %ebp
+; X86-SSE2-NEXT:    movl 76(%esp,%edx), %ecx
+; X86-SSE2-NEXT:    movl 72(%esp,%edx), %edx
 ; X86-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-SSE2-NEXT:    movl %edx, 24(%eax)
 ; X86-SSE2-NEXT:    movl %ecx, 28(%eax)
@@ -1368,18 +8554,205 @@ define void @shl_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ; X86-SSE2-NEXT:    movl %esi, 12(%eax)
 ; X86-SSE2-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
 ; X86-SSE2-NEXT:    movl %ecx, (%eax)
-; X86-SSE2-NEXT:    movl (%esp), %ecx # 4-byte Reload
+; X86-SSE2-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
 ; X86-SSE2-NEXT:    movl %ecx, 4(%eax)
-; X86-SSE2-NEXT:    addl $72, %esp
+; X86-SSE2-NEXT:    addl $92, %esp
 ; X86-SSE2-NEXT:    popl %esi
 ; X86-SSE2-NEXT:    popl %edi
 ; X86-SSE2-NEXT:    popl %ebx
 ; X86-SSE2-NEXT:    popl %ebp
 ; X86-SSE2-NEXT:    retl
 ;
-; X86-SSE42-LABEL: shl_32bytes:
+; X86-SSE42-LABEL: shl_32bytes_dwordOff:
 ; X86-SSE42:       # %bb.0:
-; X86-SSE42-NEXT:    subl $64, %esp
+; X86-SSE42-NEXT:    subl $76, %esp
+; X86-SSE42-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-SSE42-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-SSE42-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-SSE42-NEXT:    movups (%edx), %xmm0
+; X86-SSE42-NEXT:    movups 16(%edx), %xmm1
+; X86-SSE42-NEXT:    movzbl (%ecx), %ecx
+; X86-SSE42-NEXT:    xorps %xmm2, %xmm2
+; X86-SSE42-NEXT:    movaps %xmm2, {{[0-9]+}}(%esp)
+; X86-SSE42-NEXT:    movaps %xmm2, (%esp)
+; X86-SSE42-NEXT:    movaps %xmm1, {{[0-9]+}}(%esp)
+; X86-SSE42-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
+; X86-SSE42-NEXT:    shlb $2, %cl
+; X86-SSE42-NEXT:    andb $28, %cl
+; X86-SSE42-NEXT:    negb %cl
+; X86-SSE42-NEXT:    movsbl %cl, %ecx
+; X86-SSE42-NEXT:    movups 32(%esp,%ecx), %xmm0
+; X86-SSE42-NEXT:    movups 48(%esp,%ecx), %xmm1
+; X86-SSE42-NEXT:    movups %xmm1, 16(%eax)
+; X86-SSE42-NEXT:    movups %xmm0, (%eax)
+; X86-SSE42-NEXT:    addl $76, %esp
+; X86-SSE42-NEXT:    retl
+;
+; X86-AVX-LABEL: shl_32bytes_dwordOff:
+; X86-AVX:       # %bb.0:
+; X86-AVX-NEXT:    subl $76, %esp
+; X86-AVX-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-AVX-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-AVX-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-AVX-NEXT:    vmovups (%edx), %ymm0
+; X86-AVX-NEXT:    movzbl (%ecx), %ecx
+; X86-AVX-NEXT:    vxorps %xmm1, %xmm1, %xmm1
+; X86-AVX-NEXT:    vmovups %ymm1, (%esp)
+; X86-AVX-NEXT:    vmovups %ymm0, {{[0-9]+}}(%esp)
+; X86-AVX-NEXT:    shlb $2, %cl
+; X86-AVX-NEXT:    andb $28, %cl
+; X86-AVX-NEXT:    negb %cl
+; X86-AVX-NEXT:    movsbl %cl, %ecx
+; X86-AVX-NEXT:    vmovups 32(%esp,%ecx), %xmm0
+; X86-AVX-NEXT:    vmovups 48(%esp,%ecx), %xmm1
+; X86-AVX-NEXT:    vmovups %xmm1, 16(%eax)
+; X86-AVX-NEXT:    vmovups %xmm0, (%eax)
+; X86-AVX-NEXT:    addl $76, %esp
+; X86-AVX-NEXT:    vzeroupper
+; X86-AVX-NEXT:    retl
+  %src = load i256, ptr %src.ptr, align 1
+  %dwordOff = load i256, ptr %dwordOff.ptr, align 1
+  %bitOff = shl i256 %dwordOff, 5
+  %res = shl i256 %src, %bitOff
+  store i256 %res, ptr %dst, align 1
+  ret void
+}
+
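This dword-offset variant makes the shift amount a known multiple of 32 bits, and the checks reflect it: on the 32-bit targets the X86-SSE42 and X86-AVX bodies contain no shift instructions at all, just the offset arithmetic (shlb $2, andb $28, negb) and four 16-byte moves out of the padded slot. A hypothetical sketch of that degenerate case with a 32-bit shifting unit (names are illustrative):

    #include <stdint.h>
    #include <string.h>

    /* When bitOff = dwordOff * 32 the residual count is zero on a 32-bit
       target, so the whole shift reduces to indexed copies. */
    static void shl256_dword_sketch(const uint32_t src[8], unsigned dwordOff,
                                    uint32_t dst[8]) {
      uint32_t slot[16] = {0};
      memcpy(&slot[8], src, 32);
      memcpy(dst, &slot[8 - (dwordOff & 7)], 32); /* cf. shlb $2 / andb $28 */
    }

The 64-bit FALLBACK bodies still shift, because a dword offset is not necessarily qword-aligned there: shlb $5 yields a residual count of 0 or 32 bits, and only the qword-aligned remainder goes into the address via shlb $2 / andb $24.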
+define void @shl_32bytes_qwordOff(ptr %src.ptr, ptr %qwordOff.ptr, ptr %dst) nounwind {
+; X64-SSE2-LABEL: shl_32bytes_qwordOff:
+; X64-SSE2:       # %bb.0:
+; X64-SSE2-NEXT:    movq (%rdi), %rax
+; X64-SSE2-NEXT:    movq 8(%rdi), %rcx
+; X64-SSE2-NEXT:    movq 16(%rdi), %r8
+; X64-SSE2-NEXT:    movq 24(%rdi), %rdi
+; X64-SSE2-NEXT:    movzbl (%rsi), %esi
+; X64-SSE2-NEXT:    xorps %xmm0, %xmm0
+; X64-SSE2-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
+; X64-SSE2-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
+; X64-SSE2-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
+; X64-SSE2-NEXT:    movq %r8, -{{[0-9]+}}(%rsp)
+; X64-SSE2-NEXT:    movq %rcx, -{{[0-9]+}}(%rsp)
+; X64-SSE2-NEXT:    movq %rax, -{{[0-9]+}}(%rsp)
+; X64-SSE2-NEXT:    shlb $3, %sil
+; X64-SSE2-NEXT:    andb $24, %sil
+; X64-SSE2-NEXT:    negb %sil
+; X64-SSE2-NEXT:    movsbq %sil, %rax
+; X64-SSE2-NEXT:    movq -40(%rsp,%rax), %rcx
+; X64-SSE2-NEXT:    movq -32(%rsp,%rax), %rsi
+; X64-SSE2-NEXT:    movq -16(%rsp,%rax), %rdi
+; X64-SSE2-NEXT:    movq -24(%rsp,%rax), %rax
+; X64-SSE2-NEXT:    movq %rax, 16(%rdx)
+; X64-SSE2-NEXT:    movq %rdi, 24(%rdx)
+; X64-SSE2-NEXT:    movq %rcx, (%rdx)
+; X64-SSE2-NEXT:    movq %rsi, 8(%rdx)
+; X64-SSE2-NEXT:    retq
+;
+; X64-SSE42-LABEL: shl_32bytes_qwordOff:
+; X64-SSE42:       # %bb.0:
+; X64-SSE42-NEXT:    movups (%rdi), %xmm0
+; X64-SSE42-NEXT:    movups 16(%rdi), %xmm1
+; X64-SSE42-NEXT:    movzbl (%rsi), %eax
+; X64-SSE42-NEXT:    xorps %xmm2, %xmm2
+; X64-SSE42-NEXT:    movaps %xmm2, -{{[0-9]+}}(%rsp)
+; X64-SSE42-NEXT:    movaps %xmm2, -{{[0-9]+}}(%rsp)
+; X64-SSE42-NEXT:    movaps %xmm1, -{{[0-9]+}}(%rsp)
+; X64-SSE42-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
+; X64-SSE42-NEXT:    shlb $3, %al
+; X64-SSE42-NEXT:    andb $24, %al
+; X64-SSE42-NEXT:    negb %al
+; X64-SSE42-NEXT:    movsbq %al, %rax
+; X64-SSE42-NEXT:    movups -40(%rsp,%rax), %xmm0
+; X64-SSE42-NEXT:    movups -24(%rsp,%rax), %xmm1
+; X64-SSE42-NEXT:    movups %xmm1, 16(%rdx)
+; X64-SSE42-NEXT:    movups %xmm0, (%rdx)
+; X64-SSE42-NEXT:    retq
+;
+; X64-AVX-LABEL: shl_32bytes_qwordOff:
+; X64-AVX:       # %bb.0:
+; X64-AVX-NEXT:    vmovups (%rdi), %ymm0
+; X64-AVX-NEXT:    movzbl (%rsi), %eax
+; X64-AVX-NEXT:    vxorps %xmm1, %xmm1, %xmm1
+; X64-AVX-NEXT:    vmovups %ymm1, -{{[0-9]+}}(%rsp)
+; X64-AVX-NEXT:    vmovups %ymm0, -{{[0-9]+}}(%rsp)
+; X64-AVX-NEXT:    shlb $3, %al
+; X64-AVX-NEXT:    andb $24, %al
+; X64-AVX-NEXT:    negb %al
+; X64-AVX-NEXT:    movsbq %al, %rax
+; X64-AVX-NEXT:    vmovups -40(%rsp,%rax), %xmm0
+; X64-AVX-NEXT:    vmovups -24(%rsp,%rax), %xmm1
+; X64-AVX-NEXT:    vmovups %xmm1, 16(%rdx)
+; X64-AVX-NEXT:    vmovups %xmm0, (%rdx)
+; X64-AVX-NEXT:    vzeroupper
+; X64-AVX-NEXT:    retq
+;
+; X86-SSE2-LABEL: shl_32bytes_qwordOff:
+; X86-SSE2:       # %bb.0:
+; X86-SSE2-NEXT:    pushl %ebp
+; X86-SSE2-NEXT:    pushl %ebx
+; X86-SSE2-NEXT:    pushl %edi
+; X86-SSE2-NEXT:    pushl %esi
+; X86-SSE2-NEXT:    subl $92, %esp
+; X86-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %ebp
+; X86-SSE2-NEXT:    movl (%ebp), %eax
+; X86-SSE2-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-SSE2-NEXT:    movl 4(%ebp), %eax
+; X86-SSE2-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-SSE2-NEXT:    movl 8(%ebp), %esi
+; X86-SSE2-NEXT:    movl 12(%ebp), %edi
+; X86-SSE2-NEXT:    movl 16(%ebp), %ebx
+; X86-SSE2-NEXT:    movzbl (%ecx), %ecx
+; X86-SSE2-NEXT:    movl 20(%ebp), %edx
+; X86-SSE2-NEXT:    movl 24(%ebp), %eax
+; X86-SSE2-NEXT:    movl 28(%ebp), %ebp
+; X86-SSE2-NEXT:    movl %ebp, {{[0-9]+}}(%esp)
+; X86-SSE2-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-SSE2-NEXT:    xorps %xmm0, %xmm0
+; X86-SSE2-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
+; X86-SSE2-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
+; X86-SSE2-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; X86-SSE2-NEXT:    movl %ebx, {{[0-9]+}}(%esp)
+; X86-SSE2-NEXT:    movl %edi, {{[0-9]+}}(%esp)
+; X86-SSE2-NEXT:    movl %esi, {{[0-9]+}}(%esp)
+; X86-SSE2-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-SSE2-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-SSE2-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-SSE2-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-SSE2-NEXT:    shlb $3, %cl
+; X86-SSE2-NEXT:    andb $24, %cl
+; X86-SSE2-NEXT:    negb %cl
+; X86-SSE2-NEXT:    movsbl %cl, %edx
+; X86-SSE2-NEXT:    movl 48(%esp,%edx), %eax
+; X86-SSE2-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-SSE2-NEXT:    movl 52(%esp,%edx), %eax
+; X86-SSE2-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-SSE2-NEXT:    movl 60(%esp,%edx), %esi
+; X86-SSE2-NEXT:    movl 56(%esp,%edx), %edi
+; X86-SSE2-NEXT:    movl 68(%esp,%edx), %ebx
+; X86-SSE2-NEXT:    movl 64(%esp,%edx), %ebp
+; X86-SSE2-NEXT:    movl 76(%esp,%edx), %ecx
+; X86-SSE2-NEXT:    movl 72(%esp,%edx), %edx
+; X86-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-SSE2-NEXT:    movl %edx, 24(%eax)
+; X86-SSE2-NEXT:    movl %ecx, 28(%eax)
+; X86-SSE2-NEXT:    movl %ebp, 16(%eax)
+; X86-SSE2-NEXT:    movl %ebx, 20(%eax)
+; X86-SSE2-NEXT:    movl %edi, 8(%eax)
+; X86-SSE2-NEXT:    movl %esi, 12(%eax)
+; X86-SSE2-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-SSE2-NEXT:    movl %ecx, (%eax)
+; X86-SSE2-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-SSE2-NEXT:    movl %ecx, 4(%eax)
+; X86-SSE2-NEXT:    addl $92, %esp
+; X86-SSE2-NEXT:    popl %esi
+; X86-SSE2-NEXT:    popl %edi
+; X86-SSE2-NEXT:    popl %ebx
+; X86-SSE2-NEXT:    popl %ebp
+; X86-SSE2-NEXT:    retl
+;
+; X86-SSE42-LABEL: shl_32bytes_qwordOff:
+; X86-SSE42:       # %bb.0:
+; X86-SSE42-NEXT:    subl $76, %esp
 ; X86-SSE42-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-SSE42-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-SSE42-NEXT:    movl {{[0-9]+}}(%esp), %edx
@@ -1387,50 +8760,3063 @@ define void @shl_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ; X86-SSE42-NEXT:    movups 16(%edx), %xmm1
 ; X86-SSE42-NEXT:    movzbl (%ecx), %ecx
 ; X86-SSE42-NEXT:    xorps %xmm2, %xmm2
-; X86-SSE42-NEXT:    movups %xmm2, {{[0-9]+}}(%esp)
-; X86-SSE42-NEXT:    movups %xmm2, (%esp)
-; X86-SSE42-NEXT:    movups %xmm1, {{[0-9]+}}(%esp)
-; X86-SSE42-NEXT:    movups %xmm0, {{[0-9]+}}(%esp)
-; X86-SSE42-NEXT:    andb $31, %cl
+; X86-SSE42-NEXT:    movaps %xmm2, {{[0-9]+}}(%esp)
+; X86-SSE42-NEXT:    movaps %xmm2, (%esp)
+; X86-SSE42-NEXT:    movaps %xmm1, {{[0-9]+}}(%esp)
+; X86-SSE42-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
+; X86-SSE42-NEXT:    shlb $3, %cl
+; X86-SSE42-NEXT:    andb $24, %cl
 ; X86-SSE42-NEXT:    negb %cl
 ; X86-SSE42-NEXT:    movsbl %cl, %ecx
 ; X86-SSE42-NEXT:    movups 32(%esp,%ecx), %xmm0
 ; X86-SSE42-NEXT:    movups 48(%esp,%ecx), %xmm1
 ; X86-SSE42-NEXT:    movups %xmm1, 16(%eax)
 ; X86-SSE42-NEXT:    movups %xmm0, (%eax)
+; X86-SSE42-NEXT:    addl $76, %esp
+; X86-SSE42-NEXT:    retl
+;
+; X86-AVX-LABEL: shl_32bytes_qwordOff:
+; X86-AVX:       # %bb.0:
+; X86-AVX-NEXT:    subl $76, %esp
+; X86-AVX-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-AVX-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-AVX-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-AVX-NEXT:    vmovups (%edx), %ymm0
+; X86-AVX-NEXT:    movzbl (%ecx), %ecx
+; X86-AVX-NEXT:    vxorps %xmm1, %xmm1, %xmm1
+; X86-AVX-NEXT:    vmovups %ymm1, (%esp)
+; X86-AVX-NEXT:    vmovups %ymm0, {{[0-9]+}}(%esp)
+; X86-AVX-NEXT:    shlb $3, %cl
+; X86-AVX-NEXT:    andb $24, %cl
+; X86-AVX-NEXT:    negb %cl
+; X86-AVX-NEXT:    movsbl %cl, %ecx
+; X86-AVX-NEXT:    vmovups 32(%esp,%ecx), %xmm0
+; X86-AVX-NEXT:    vmovups 48(%esp,%ecx), %xmm1
+; X86-AVX-NEXT:    vmovups %xmm1, 16(%eax)
+; X86-AVX-NEXT:    vmovups %xmm0, (%eax)
+; X86-AVX-NEXT:    addl $76, %esp
+; X86-AVX-NEXT:    vzeroupper
+; X86-AVX-NEXT:    retl
+  %src = load i256, ptr %src.ptr, align 1
+  %qwordOff = load i256, ptr %qwordOff.ptr, align 1
+  %bitOff = shl i256 %qwordOff, 6
+  %res = shl i256 %src, %bitOff
+  store i256 %res, ptr %dst, align 1
+  ret void
+}
+
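With a qword offset the bit count is a multiple of 64, so even the x86-64 scalar lowering needs no shift instructions: X64-SSE2 is four qword loads at -40/-32/-24/-16(%rsp,%rax) plus four stores, and the SSE4.2/AVX bodies are two 16-byte moves. In terms of the hypothetical shl256_sketch above, rem is always 0 here; for example:

    uint64_t src[4] = {1, 0, 0, 0}, dst[4];
    shl256_sketch(src, 64, dst);  /* qwordOff = 1: dst == {0, 1, 0, 0} */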
+define void @ashr_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
+; FALLBACK0-LABEL: ashr_32bytes:
+; FALLBACK0:       # %bb.0:
+; FALLBACK0-NEXT:    pushq %rbx
+; FALLBACK0-NEXT:    movq (%rdi), %rcx
+; FALLBACK0-NEXT:    movq 8(%rdi), %r8
+; FALLBACK0-NEXT:    movq 16(%rdi), %r9
+; FALLBACK0-NEXT:    movq 24(%rdi), %rdi
+; FALLBACK0-NEXT:    movzbl (%rsi), %esi
+; FALLBACK0-NEXT:    leal (,%rsi,8), %eax
+; FALLBACK0-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK0-NEXT:    movq %r9, -{{[0-9]+}}(%rsp)
+; FALLBACK0-NEXT:    movq %r8, -{{[0-9]+}}(%rsp)
+; FALLBACK0-NEXT:    movq %rcx, -{{[0-9]+}}(%rsp)
+; FALLBACK0-NEXT:    sarq $63, %rdi
+; FALLBACK0-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK0-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK0-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK0-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK0-NEXT:    andb $24, %sil
+; FALLBACK0-NEXT:    movzbl %sil, %r9d
+; FALLBACK0-NEXT:    movq -64(%rsp,%r9), %r10
+; FALLBACK0-NEXT:    movq -56(%rsp,%r9), %rdi
+; FALLBACK0-NEXT:    movq %rdi, %r11
+; FALLBACK0-NEXT:    movl %eax, %ecx
+; FALLBACK0-NEXT:    shrq %cl, %r11
+; FALLBACK0-NEXT:    movl %eax, %esi
+; FALLBACK0-NEXT:    notb %sil
+; FALLBACK0-NEXT:    movq -48(%rsp,%r9), %rbx
+; FALLBACK0-NEXT:    leaq (%rbx,%rbx), %r8
+; FALLBACK0-NEXT:    movl %esi, %ecx
+; FALLBACK0-NEXT:    shlq %cl, %r8
+; FALLBACK0-NEXT:    orq %r11, %r8
+; FALLBACK0-NEXT:    movl %eax, %ecx
+; FALLBACK0-NEXT:    shrq %cl, %r10
+; FALLBACK0-NEXT:    addq %rdi, %rdi
+; FALLBACK0-NEXT:    movl %esi, %ecx
+; FALLBACK0-NEXT:    shlq %cl, %rdi
+; FALLBACK0-NEXT:    orq %r10, %rdi
+; FALLBACK0-NEXT:    movl %eax, %ecx
+; FALLBACK0-NEXT:    shrq %cl, %rbx
+; FALLBACK0-NEXT:    movq -40(%rsp,%r9), %r9
+; FALLBACK0-NEXT:    leaq (%r9,%r9), %r10
+; FALLBACK0-NEXT:    movl %esi, %ecx
+; FALLBACK0-NEXT:    shlq %cl, %r10
+; FALLBACK0-NEXT:    orq %rbx, %r10
+; FALLBACK0-NEXT:    movl %eax, %ecx
+; FALLBACK0-NEXT:    sarq %cl, %r9
+; FALLBACK0-NEXT:    movq %r9, 24(%rdx)
+; FALLBACK0-NEXT:    movq %r10, 16(%rdx)
+; FALLBACK0-NEXT:    movq %rdi, (%rdx)
+; FALLBACK0-NEXT:    movq %r8, 8(%rdx)
+; FALLBACK0-NEXT:    popq %rbx
+; FALLBACK0-NEXT:    retq
+;
+; FALLBACK1-LABEL: ashr_32bytes:
+; FALLBACK1:       # %bb.0:
+; FALLBACK1-NEXT:    movq (%rdi), %rax
+; FALLBACK1-NEXT:    movq 8(%rdi), %r8
+; FALLBACK1-NEXT:    movq 16(%rdi), %r9
+; FALLBACK1-NEXT:    movq 24(%rdi), %rdi
+; FALLBACK1-NEXT:    movzbl (%rsi), %esi
+; FALLBACK1-NEXT:    leal (,%rsi,8), %ecx
+; FALLBACK1-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK1-NEXT:    movq %r9, -{{[0-9]+}}(%rsp)
+; FALLBACK1-NEXT:    movq %r8, -{{[0-9]+}}(%rsp)
+; FALLBACK1-NEXT:    movq %rax, -{{[0-9]+}}(%rsp)
+; FALLBACK1-NEXT:    sarq $63, %rdi
+; FALLBACK1-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK1-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK1-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK1-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK1-NEXT:    andb $24, %sil
+; FALLBACK1-NEXT:    movzbl %sil, %eax
+; FALLBACK1-NEXT:    movq -56(%rsp,%rax), %rsi
+; FALLBACK1-NEXT:    movq -72(%rsp,%rax), %rdi
+; FALLBACK1-NEXT:    movq -64(%rsp,%rax), %r8
+; FALLBACK1-NEXT:    movq %r8, %r9
+; FALLBACK1-NEXT:    shrdq %cl, %rsi, %r9
+; FALLBACK1-NEXT:    movq -48(%rsp,%rax), %rax
+; FALLBACK1-NEXT:    shrdq %cl, %rax, %rsi
+; FALLBACK1-NEXT:    shrdq %cl, %r8, %rdi
+; FALLBACK1-NEXT:    # kill: def $cl killed $cl killed $ecx
+; FALLBACK1-NEXT:    sarq %cl, %rax
+; FALLBACK1-NEXT:    movq %rsi, 16(%rdx)
+; FALLBACK1-NEXT:    movq %rax, 24(%rdx)
+; FALLBACK1-NEXT:    movq %rdi, (%rdx)
+; FALLBACK1-NEXT:    movq %r9, 8(%rdx)
+; FALLBACK1-NEXT:    retq
+;
+; FALLBACK2-LABEL: ashr_32bytes:
+; FALLBACK2:       # %bb.0:
+; FALLBACK2-NEXT:    movq (%rdi), %rcx
+; FALLBACK2-NEXT:    movq 8(%rdi), %r8
+; FALLBACK2-NEXT:    movq 16(%rdi), %r9
+; FALLBACK2-NEXT:    movq 24(%rdi), %rdi
+; FALLBACK2-NEXT:    movzbl (%rsi), %esi
+; FALLBACK2-NEXT:    leal (,%rsi,8), %eax
+; FALLBACK2-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK2-NEXT:    movq %r9, -{{[0-9]+}}(%rsp)
+; FALLBACK2-NEXT:    movq %r8, -{{[0-9]+}}(%rsp)
+; FALLBACK2-NEXT:    movq %rcx, -{{[0-9]+}}(%rsp)
+; FALLBACK2-NEXT:    sarq $63, %rdi
+; FALLBACK2-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK2-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK2-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK2-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK2-NEXT:    andb $24, %sil
+; FALLBACK2-NEXT:    movzbl %sil, %ecx
+; FALLBACK2-NEXT:    movq -64(%rsp,%rcx), %rsi
+; FALLBACK2-NEXT:    movq -56(%rsp,%rcx), %rdi
+; FALLBACK2-NEXT:    shrxq %rax, %rsi, %r8
+; FALLBACK2-NEXT:    shrxq %rax, -72(%rsp,%rcx), %r9
+; FALLBACK2-NEXT:    shrxq %rax, %rdi, %r10
+; FALLBACK2-NEXT:    movq -48(%rsp,%rcx), %rcx
+; FALLBACK2-NEXT:    sarxq %rax, %rcx, %r11
+; FALLBACK2-NEXT:    # kill: def $al killed $al killed $rax def $rax
+; FALLBACK2-NEXT:    notb %al
+; FALLBACK2-NEXT:    addq %rdi, %rdi
+; FALLBACK2-NEXT:    shlxq %rax, %rdi, %rdi
+; FALLBACK2-NEXT:    orq %r8, %rdi
+; FALLBACK2-NEXT:    addq %rsi, %rsi
+; FALLBACK2-NEXT:    shlxq %rax, %rsi, %rsi
+; FALLBACK2-NEXT:    orq %r9, %rsi
+; FALLBACK2-NEXT:    addq %rcx, %rcx
+; FALLBACK2-NEXT:    shlxq %rax, %rcx, %rax
+; FALLBACK2-NEXT:    orq %r10, %rax
+; FALLBACK2-NEXT:    movq %r11, 24(%rdx)
+; FALLBACK2-NEXT:    movq %rax, 16(%rdx)
+; FALLBACK2-NEXT:    movq %rsi, (%rdx)
+; FALLBACK2-NEXT:    movq %rdi, 8(%rdx)
+; FALLBACK2-NEXT:    retq
+;
+; FALLBACK3-LABEL: ashr_32bytes:
+; FALLBACK3:       # %bb.0:
+; FALLBACK3-NEXT:    movq (%rdi), %rax
+; FALLBACK3-NEXT:    movq 8(%rdi), %r8
+; FALLBACK3-NEXT:    movq 16(%rdi), %r9
+; FALLBACK3-NEXT:    movq 24(%rdi), %rdi
+; FALLBACK3-NEXT:    movzbl (%rsi), %esi
+; FALLBACK3-NEXT:    leal (,%rsi,8), %ecx
+; FALLBACK3-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK3-NEXT:    movq %r9, -{{[0-9]+}}(%rsp)
+; FALLBACK3-NEXT:    movq %r8, -{{[0-9]+}}(%rsp)
+; FALLBACK3-NEXT:    movq %rax, -{{[0-9]+}}(%rsp)
+; FALLBACK3-NEXT:    sarq $63, %rdi
+; FALLBACK3-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK3-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK3-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK3-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK3-NEXT:    andb $24, %sil
+; FALLBACK3-NEXT:    movzbl %sil, %eax
+; FALLBACK3-NEXT:    movq -56(%rsp,%rax), %rsi
+; FALLBACK3-NEXT:    movq -72(%rsp,%rax), %rdi
+; FALLBACK3-NEXT:    movq -64(%rsp,%rax), %r8
+; FALLBACK3-NEXT:    movq %r8, %r9
+; FALLBACK3-NEXT:    shrdq %cl, %rsi, %r9
+; FALLBACK3-NEXT:    movq -48(%rsp,%rax), %rax
+; FALLBACK3-NEXT:    shrdq %cl, %rax, %rsi
+; FALLBACK3-NEXT:    shrdq %cl, %r8, %rdi
+; FALLBACK3-NEXT:    sarxq %rcx, %rax, %rax
+; FALLBACK3-NEXT:    movq %rsi, 16(%rdx)
+; FALLBACK3-NEXT:    movq %rax, 24(%rdx)
+; FALLBACK3-NEXT:    movq %rdi, (%rdx)
+; FALLBACK3-NEXT:    movq %r9, 8(%rdx)
+; FALLBACK3-NEXT:    retq
+;
+; FALLBACK4-LABEL: ashr_32bytes:
+; FALLBACK4:       # %bb.0:
+; FALLBACK4-NEXT:    pushq %rbx
+; FALLBACK4-NEXT:    movups (%rdi), %xmm0
+; FALLBACK4-NEXT:    movq 16(%rdi), %rcx
+; FALLBACK4-NEXT:    movq 24(%rdi), %rdi
+; FALLBACK4-NEXT:    movzbl (%rsi), %esi
+; FALLBACK4-NEXT:    leal (,%rsi,8), %eax
+; FALLBACK4-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK4-NEXT:    movq %rcx, -{{[0-9]+}}(%rsp)
+; FALLBACK4-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
+; FALLBACK4-NEXT:    sarq $63, %rdi
+; FALLBACK4-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK4-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK4-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK4-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK4-NEXT:    andb $24, %sil
+; FALLBACK4-NEXT:    movzbl %sil, %r9d
+; FALLBACK4-NEXT:    movq -64(%rsp,%r9), %r10
+; FALLBACK4-NEXT:    movq -56(%rsp,%r9), %r8
+; FALLBACK4-NEXT:    movl %eax, %ecx
+; FALLBACK4-NEXT:    shrq %cl, %r10
+; FALLBACK4-NEXT:    movl %eax, %esi
+; FALLBACK4-NEXT:    notb %sil
+; FALLBACK4-NEXT:    leaq (%r8,%r8), %rdi
+; FALLBACK4-NEXT:    movl %esi, %ecx
+; FALLBACK4-NEXT:    shlq %cl, %rdi
+; FALLBACK4-NEXT:    orq %r10, %rdi
+; FALLBACK4-NEXT:    movq -48(%rsp,%r9), %r10
+; FALLBACK4-NEXT:    movq %r10, %r11
+; FALLBACK4-NEXT:    movl %eax, %ecx
+; FALLBACK4-NEXT:    shrq %cl, %r11
+; FALLBACK4-NEXT:    movq -40(%rsp,%r9), %r9
+; FALLBACK4-NEXT:    leaq (%r9,%r9), %rbx
+; FALLBACK4-NEXT:    movl %esi, %ecx
+; FALLBACK4-NEXT:    shlq %cl, %rbx
+; FALLBACK4-NEXT:    orq %r11, %rbx
+; FALLBACK4-NEXT:    movl %eax, %ecx
+; FALLBACK4-NEXT:    shrq %cl, %r8
+; FALLBACK4-NEXT:    addq %r10, %r10
+; FALLBACK4-NEXT:    movl %esi, %ecx
+; FALLBACK4-NEXT:    shlq %cl, %r10
+; FALLBACK4-NEXT:    orq %r8, %r10
+; FALLBACK4-NEXT:    movl %eax, %ecx
+; FALLBACK4-NEXT:    sarq %cl, %r9
+; FALLBACK4-NEXT:    movq %r9, 24(%rdx)
+; FALLBACK4-NEXT:    movq %r10, 8(%rdx)
+; FALLBACK4-NEXT:    movq %rbx, 16(%rdx)
+; FALLBACK4-NEXT:    movq %rdi, (%rdx)
+; FALLBACK4-NEXT:    popq %rbx
+; FALLBACK4-NEXT:    retq
+;
+; FALLBACK5-LABEL: ashr_32bytes:
+; FALLBACK5:       # %bb.0:
+; FALLBACK5-NEXT:    movups (%rdi), %xmm0
+; FALLBACK5-NEXT:    movq 16(%rdi), %rax
+; FALLBACK5-NEXT:    movq 24(%rdi), %rdi
+; FALLBACK5-NEXT:    movzbl (%rsi), %esi
+; FALLBACK5-NEXT:    leal (,%rsi,8), %ecx
+; FALLBACK5-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK5-NEXT:    movq %rax, -{{[0-9]+}}(%rsp)
+; FALLBACK5-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
+; FALLBACK5-NEXT:    sarq $63, %rdi
+; FALLBACK5-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK5-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK5-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK5-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK5-NEXT:    andb $24, %sil
+; FALLBACK5-NEXT:    movzbl %sil, %eax
+; FALLBACK5-NEXT:    movq -48(%rsp,%rax), %rsi
+; FALLBACK5-NEXT:    movq -56(%rsp,%rax), %rdi
+; FALLBACK5-NEXT:    movq %rdi, %r8
+; FALLBACK5-NEXT:    shrdq %cl, %rsi, %r8
+; FALLBACK5-NEXT:    movq -72(%rsp,%rax), %r9
+; FALLBACK5-NEXT:    movq -64(%rsp,%rax), %rax
+; FALLBACK5-NEXT:    movq %rax, %r10
+; FALLBACK5-NEXT:    shrdq %cl, %rdi, %r10
+; FALLBACK5-NEXT:    shrdq %cl, %rax, %r9
+; FALLBACK5-NEXT:    # kill: def $cl killed $cl killed $ecx
+; FALLBACK5-NEXT:    sarq %cl, %rsi
+; FALLBACK5-NEXT:    movq %r10, 8(%rdx)
+; FALLBACK5-NEXT:    movq %r8, 16(%rdx)
+; FALLBACK5-NEXT:    movq %rsi, 24(%rdx)
+; FALLBACK5-NEXT:    movq %r9, (%rdx)
+; FALLBACK5-NEXT:    retq
+;
+; FALLBACK6-LABEL: ashr_32bytes:
+; FALLBACK6:       # %bb.0:
+; FALLBACK6-NEXT:    movups (%rdi), %xmm0
+; FALLBACK6-NEXT:    movq 16(%rdi), %rcx
+; FALLBACK6-NEXT:    movq 24(%rdi), %rdi
+; FALLBACK6-NEXT:    movzbl (%rsi), %esi
+; FALLBACK6-NEXT:    leal (,%rsi,8), %eax
+; FALLBACK6-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK6-NEXT:    movq %rcx, -{{[0-9]+}}(%rsp)
+; FALLBACK6-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
+; FALLBACK6-NEXT:    sarq $63, %rdi
+; FALLBACK6-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK6-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK6-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK6-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK6-NEXT:    andb $24, %sil
+; FALLBACK6-NEXT:    movzbl %sil, %ecx
+; FALLBACK6-NEXT:    shrxq %rax, -72(%rsp,%rcx), %rsi
+; FALLBACK6-NEXT:    movq -64(%rsp,%rcx), %rdi
+; FALLBACK6-NEXT:    movq -56(%rsp,%rcx), %r8
+; FALLBACK6-NEXT:    shrxq %rax, %r8, %r9
+; FALLBACK6-NEXT:    movq -48(%rsp,%rcx), %rcx
+; FALLBACK6-NEXT:    shrxq %rax, %rdi, %r10
+; FALLBACK6-NEXT:    sarxq %rax, %rcx, %r11
+; FALLBACK6-NEXT:    # kill: def $al killed $al killed $rax def $rax
+; FALLBACK6-NEXT:    notb %al
+; FALLBACK6-NEXT:    addq %rdi, %rdi
+; FALLBACK6-NEXT:    shlxq %rax, %rdi, %rdi
+; FALLBACK6-NEXT:    orq %rsi, %rdi
+; FALLBACK6-NEXT:    addq %rcx, %rcx
+; FALLBACK6-NEXT:    shlxq %rax, %rcx, %rcx
+; FALLBACK6-NEXT:    orq %r9, %rcx
+; FALLBACK6-NEXT:    addq %r8, %r8
+; FALLBACK6-NEXT:    shlxq %rax, %r8, %rax
+; FALLBACK6-NEXT:    orq %r10, %rax
+; FALLBACK6-NEXT:    movq %r11, 24(%rdx)
+; FALLBACK6-NEXT:    movq %rax, 8(%rdx)
+; FALLBACK6-NEXT:    movq %rcx, 16(%rdx)
+; FALLBACK6-NEXT:    movq %rdi, (%rdx)
+; FALLBACK6-NEXT:    retq
+;
+; FALLBACK7-LABEL: ashr_32bytes:
+; FALLBACK7:       # %bb.0:
+; FALLBACK7-NEXT:    movups (%rdi), %xmm0
+; FALLBACK7-NEXT:    movq 16(%rdi), %rax
+; FALLBACK7-NEXT:    movq 24(%rdi), %rdi
+; FALLBACK7-NEXT:    movzbl (%rsi), %esi
+; FALLBACK7-NEXT:    leal (,%rsi,8), %ecx
+; FALLBACK7-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK7-NEXT:    movq %rax, -{{[0-9]+}}(%rsp)
+; FALLBACK7-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
+; FALLBACK7-NEXT:    sarq $63, %rdi
+; FALLBACK7-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK7-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK7-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK7-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK7-NEXT:    andb $24, %sil
+; FALLBACK7-NEXT:    movzbl %sil, %eax
+; FALLBACK7-NEXT:    movq -48(%rsp,%rax), %rsi
+; FALLBACK7-NEXT:    movq -56(%rsp,%rax), %rdi
+; FALLBACK7-NEXT:    movq %rdi, %r8
+; FALLBACK7-NEXT:    shrdq %cl, %rsi, %r8
+; FALLBACK7-NEXT:    movq -72(%rsp,%rax), %r9
+; FALLBACK7-NEXT:    movq -64(%rsp,%rax), %rax
+; FALLBACK7-NEXT:    movq %rax, %r10
+; FALLBACK7-NEXT:    shrdq %cl, %rdi, %r10
+; FALLBACK7-NEXT:    shrdq %cl, %rax, %r9
+; FALLBACK7-NEXT:    sarxq %rcx, %rsi, %rax
+; FALLBACK7-NEXT:    movq %r10, 8(%rdx)
+; FALLBACK7-NEXT:    movq %r8, 16(%rdx)
+; FALLBACK7-NEXT:    movq %rax, 24(%rdx)
+; FALLBACK7-NEXT:    movq %r9, (%rdx)
+; FALLBACK7-NEXT:    retq
+;
+; FALLBACK8-LABEL: ashr_32bytes:
+; FALLBACK8:       # %bb.0:
+; FALLBACK8-NEXT:    pushq %rbx
+; FALLBACK8-NEXT:    vmovups (%rdi), %xmm0
+; FALLBACK8-NEXT:    movq 16(%rdi), %rcx
+; FALLBACK8-NEXT:    movq 24(%rdi), %rdi
+; FALLBACK8-NEXT:    movzbl (%rsi), %esi
+; FALLBACK8-NEXT:    leal (,%rsi,8), %eax
+; FALLBACK8-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK8-NEXT:    movq %rcx, -{{[0-9]+}}(%rsp)
+; FALLBACK8-NEXT:    vmovaps %xmm0, -{{[0-9]+}}(%rsp)
+; FALLBACK8-NEXT:    sarq $63, %rdi
+; FALLBACK8-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK8-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK8-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK8-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK8-NEXT:    andb $24, %sil
+; FALLBACK8-NEXT:    movzbl %sil, %r9d
+; FALLBACK8-NEXT:    movq -64(%rsp,%r9), %r10
+; FALLBACK8-NEXT:    movq -56(%rsp,%r9), %r8
+; FALLBACK8-NEXT:    movl %eax, %ecx
+; FALLBACK8-NEXT:    shrq %cl, %r10
+; FALLBACK8-NEXT:    movl %eax, %esi
+; FALLBACK8-NEXT:    notb %sil
+; FALLBACK8-NEXT:    leaq (%r8,%r8), %rdi
+; FALLBACK8-NEXT:    movl %esi, %ecx
+; FALLBACK8-NEXT:    shlq %cl, %rdi
+; FALLBACK8-NEXT:    orq %r10, %rdi
+; FALLBACK8-NEXT:    movq -48(%rsp,%r9), %r10
+; FALLBACK8-NEXT:    movq %r10, %r11
+; FALLBACK8-NEXT:    movl %eax, %ecx
+; FALLBACK8-NEXT:    shrq %cl, %r11
+; FALLBACK8-NEXT:    movq -40(%rsp,%r9), %r9
+; FALLBACK8-NEXT:    leaq (%r9,%r9), %rbx
+; FALLBACK8-NEXT:    movl %esi, %ecx
+; FALLBACK8-NEXT:    shlq %cl, %rbx
+; FALLBACK8-NEXT:    orq %r11, %rbx
+; FALLBACK8-NEXT:    movl %eax, %ecx
+; FALLBACK8-NEXT:    shrq %cl, %r8
+; FALLBACK8-NEXT:    addq %r10, %r10
+; FALLBACK8-NEXT:    movl %esi, %ecx
+; FALLBACK8-NEXT:    shlq %cl, %r10
+; FALLBACK8-NEXT:    orq %r8, %r10
+; FALLBACK8-NEXT:    movl %eax, %ecx
+; FALLBACK8-NEXT:    sarq %cl, %r9
+; FALLBACK8-NEXT:    movq %r9, 24(%rdx)
+; FALLBACK8-NEXT:    movq %r10, 8(%rdx)
+; FALLBACK8-NEXT:    movq %rbx, 16(%rdx)
+; FALLBACK8-NEXT:    movq %rdi, (%rdx)
+; FALLBACK8-NEXT:    popq %rbx
+; FALLBACK8-NEXT:    retq
+;
+; FALLBACK9-LABEL: ashr_32bytes:
+; FALLBACK9:       # %bb.0:
+; FALLBACK9-NEXT:    vmovups (%rdi), %xmm0
+; FALLBACK9-NEXT:    movq 16(%rdi), %rax
+; FALLBACK9-NEXT:    movq 24(%rdi), %rdi
+; FALLBACK9-NEXT:    movzbl (%rsi), %esi
+; FALLBACK9-NEXT:    leal (,%rsi,8), %ecx
+; FALLBACK9-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK9-NEXT:    movq %rax, -{{[0-9]+}}(%rsp)
+; FALLBACK9-NEXT:    vmovaps %xmm0, -{{[0-9]+}}(%rsp)
+; FALLBACK9-NEXT:    sarq $63, %rdi
+; FALLBACK9-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK9-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK9-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK9-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK9-NEXT:    andb $24, %sil
+; FALLBACK9-NEXT:    movzbl %sil, %eax
+; FALLBACK9-NEXT:    movq -48(%rsp,%rax), %rsi
+; FALLBACK9-NEXT:    movq -56(%rsp,%rax), %rdi
+; FALLBACK9-NEXT:    movq %rdi, %r8
+; FALLBACK9-NEXT:    shrdq %cl, %rsi, %r8
+; FALLBACK9-NEXT:    movq -72(%rsp,%rax), %r9
+; FALLBACK9-NEXT:    movq -64(%rsp,%rax), %rax
+; FALLBACK9-NEXT:    movq %rax, %r10
+; FALLBACK9-NEXT:    shrdq %cl, %rdi, %r10
+; FALLBACK9-NEXT:    shrdq %cl, %rax, %r9
+; FALLBACK9-NEXT:    # kill: def $cl killed $cl killed $ecx
+; FALLBACK9-NEXT:    sarq %cl, %rsi
+; FALLBACK9-NEXT:    movq %r10, 8(%rdx)
+; FALLBACK9-NEXT:    movq %r8, 16(%rdx)
+; FALLBACK9-NEXT:    movq %rsi, 24(%rdx)
+; FALLBACK9-NEXT:    movq %r9, (%rdx)
+; FALLBACK9-NEXT:    retq
+;
+; FALLBACK10-LABEL: ashr_32bytes:
+; FALLBACK10:       # %bb.0:
+; FALLBACK10-NEXT:    vmovups (%rdi), %xmm0
+; FALLBACK10-NEXT:    movq 16(%rdi), %rcx
+; FALLBACK10-NEXT:    movq 24(%rdi), %rdi
+; FALLBACK10-NEXT:    movzbl (%rsi), %esi
+; FALLBACK10-NEXT:    leal (,%rsi,8), %eax
+; FALLBACK10-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK10-NEXT:    movq %rcx, -{{[0-9]+}}(%rsp)
+; FALLBACK10-NEXT:    vmovaps %xmm0, -{{[0-9]+}}(%rsp)
+; FALLBACK10-NEXT:    sarq $63, %rdi
+; FALLBACK10-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK10-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK10-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK10-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK10-NEXT:    andb $24, %sil
+; FALLBACK10-NEXT:    movzbl %sil, %ecx
+; FALLBACK10-NEXT:    shrxq %rax, -72(%rsp,%rcx), %rsi
+; FALLBACK10-NEXT:    movq -64(%rsp,%rcx), %rdi
+; FALLBACK10-NEXT:    movq -56(%rsp,%rcx), %r8
+; FALLBACK10-NEXT:    shrxq %rax, %r8, %r9
+; FALLBACK10-NEXT:    movq -48(%rsp,%rcx), %rcx
+; FALLBACK10-NEXT:    shrxq %rax, %rdi, %r10
+; FALLBACK10-NEXT:    sarxq %rax, %rcx, %r11
+; FALLBACK10-NEXT:    # kill: def $al killed $al killed $rax def $rax
+; FALLBACK10-NEXT:    notb %al
+; FALLBACK10-NEXT:    addq %rdi, %rdi
+; FALLBACK10-NEXT:    shlxq %rax, %rdi, %rdi
+; FALLBACK10-NEXT:    orq %rsi, %rdi
+; FALLBACK10-NEXT:    addq %rcx, %rcx
+; FALLBACK10-NEXT:    shlxq %rax, %rcx, %rcx
+; FALLBACK10-NEXT:    orq %r9, %rcx
+; FALLBACK10-NEXT:    addq %r8, %r8
+; FALLBACK10-NEXT:    shlxq %rax, %r8, %rax
+; FALLBACK10-NEXT:    orq %r10, %rax
+; FALLBACK10-NEXT:    movq %r11, 24(%rdx)
+; FALLBACK10-NEXT:    movq %rax, 8(%rdx)
+; FALLBACK10-NEXT:    movq %rcx, 16(%rdx)
+; FALLBACK10-NEXT:    movq %rdi, (%rdx)
+; FALLBACK10-NEXT:    retq
+;
+; FALLBACK11-LABEL: ashr_32bytes:
+; FALLBACK11:       # %bb.0:
+; FALLBACK11-NEXT:    vmovups (%rdi), %xmm0
+; FALLBACK11-NEXT:    movq 16(%rdi), %rax
+; FALLBACK11-NEXT:    movq 24(%rdi), %rdi
+; FALLBACK11-NEXT:    movzbl (%rsi), %esi
+; FALLBACK11-NEXT:    leal (,%rsi,8), %ecx
+; FALLBACK11-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK11-NEXT:    movq %rax, -{{[0-9]+}}(%rsp)
+; FALLBACK11-NEXT:    vmovaps %xmm0, -{{[0-9]+}}(%rsp)
+; FALLBACK11-NEXT:    sarq $63, %rdi
+; FALLBACK11-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK11-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK11-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK11-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK11-NEXT:    andb $24, %sil
+; FALLBACK11-NEXT:    movzbl %sil, %eax
+; FALLBACK11-NEXT:    movq -48(%rsp,%rax), %rsi
+; FALLBACK11-NEXT:    movq -56(%rsp,%rax), %rdi
+; FALLBACK11-NEXT:    movq %rdi, %r8
+; FALLBACK11-NEXT:    shrdq %cl, %rsi, %r8
+; FALLBACK11-NEXT:    movq -72(%rsp,%rax), %r9
+; FALLBACK11-NEXT:    movq -64(%rsp,%rax), %rax
+; FALLBACK11-NEXT:    movq %rax, %r10
+; FALLBACK11-NEXT:    shrdq %cl, %rdi, %r10
+; FALLBACK11-NEXT:    shrdq %cl, %rax, %r9
+; FALLBACK11-NEXT:    sarxq %rcx, %rsi, %rax
+; FALLBACK11-NEXT:    movq %r10, 8(%rdx)
+; FALLBACK11-NEXT:    movq %r8, 16(%rdx)
+; FALLBACK11-NEXT:    movq %rax, 24(%rdx)
+; FALLBACK11-NEXT:    movq %r9, (%rdx)
+; FALLBACK11-NEXT:    retq
+;
+; FALLBACK12-LABEL: ashr_32bytes:
+; FALLBACK12:       # %bb.0:
+; FALLBACK12-NEXT:    pushq %rbx
+; FALLBACK12-NEXT:    vmovups (%rdi), %xmm0
+; FALLBACK12-NEXT:    movq 16(%rdi), %rcx
+; FALLBACK12-NEXT:    movq 24(%rdi), %rdi
+; FALLBACK12-NEXT:    movzbl (%rsi), %esi
+; FALLBACK12-NEXT:    leal (,%rsi,8), %eax
+; FALLBACK12-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK12-NEXT:    movq %rcx, -{{[0-9]+}}(%rsp)
+; FALLBACK12-NEXT:    vmovaps %xmm0, -{{[0-9]+}}(%rsp)
+; FALLBACK12-NEXT:    sarq $63, %rdi
+; FALLBACK12-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK12-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK12-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK12-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK12-NEXT:    andb $24, %sil
+; FALLBACK12-NEXT:    movzbl %sil, %r9d
+; FALLBACK12-NEXT:    movq -64(%rsp,%r9), %r10
+; FALLBACK12-NEXT:    movq -56(%rsp,%r9), %r8
+; FALLBACK12-NEXT:    movl %eax, %ecx
+; FALLBACK12-NEXT:    shrq %cl, %r10
+; FALLBACK12-NEXT:    movl %eax, %esi
+; FALLBACK12-NEXT:    notb %sil
+; FALLBACK12-NEXT:    leaq (%r8,%r8), %rdi
+; FALLBACK12-NEXT:    movl %esi, %ecx
+; FALLBACK12-NEXT:    shlq %cl, %rdi
+; FALLBACK12-NEXT:    orq %r10, %rdi
+; FALLBACK12-NEXT:    movq -48(%rsp,%r9), %r10
+; FALLBACK12-NEXT:    movq %r10, %r11
+; FALLBACK12-NEXT:    movl %eax, %ecx
+; FALLBACK12-NEXT:    shrq %cl, %r11
+; FALLBACK12-NEXT:    movq -40(%rsp,%r9), %r9
+; FALLBACK12-NEXT:    leaq (%r9,%r9), %rbx
+; FALLBACK12-NEXT:    movl %esi, %ecx
+; FALLBACK12-NEXT:    shlq %cl, %rbx
+; FALLBACK12-NEXT:    orq %r11, %rbx
+; FALLBACK12-NEXT:    movl %eax, %ecx
+; FALLBACK12-NEXT:    shrq %cl, %r8
+; FALLBACK12-NEXT:    addq %r10, %r10
+; FALLBACK12-NEXT:    movl %esi, %ecx
+; FALLBACK12-NEXT:    shlq %cl, %r10
+; FALLBACK12-NEXT:    orq %r8, %r10
+; FALLBACK12-NEXT:    movl %eax, %ecx
+; FALLBACK12-NEXT:    sarq %cl, %r9
+; FALLBACK12-NEXT:    movq %r9, 24(%rdx)
+; FALLBACK12-NEXT:    movq %r10, 8(%rdx)
+; FALLBACK12-NEXT:    movq %rbx, 16(%rdx)
+; FALLBACK12-NEXT:    movq %rdi, (%rdx)
+; FALLBACK12-NEXT:    popq %rbx
+; FALLBACK12-NEXT:    retq
+;
+; FALLBACK13-LABEL: ashr_32bytes:
+; FALLBACK13:       # %bb.0:
+; FALLBACK13-NEXT:    vmovups (%rdi), %xmm0
+; FALLBACK13-NEXT:    movq 16(%rdi), %rax
+; FALLBACK13-NEXT:    movq 24(%rdi), %rdi
+; FALLBACK13-NEXT:    movzbl (%rsi), %esi
+; FALLBACK13-NEXT:    leal (,%rsi,8), %ecx
+; FALLBACK13-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK13-NEXT:    movq %rax, -{{[0-9]+}}(%rsp)
+; FALLBACK13-NEXT:    vmovaps %xmm0, -{{[0-9]+}}(%rsp)
+; FALLBACK13-NEXT:    sarq $63, %rdi
+; FALLBACK13-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK13-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK13-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK13-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK13-NEXT:    andb $24, %sil
+; FALLBACK13-NEXT:    movzbl %sil, %eax
+; FALLBACK13-NEXT:    movq -48(%rsp,%rax), %rsi
+; FALLBACK13-NEXT:    movq -56(%rsp,%rax), %rdi
+; FALLBACK13-NEXT:    movq %rdi, %r8
+; FALLBACK13-NEXT:    shrdq %cl, %rsi, %r8
+; FALLBACK13-NEXT:    movq -72(%rsp,%rax), %r9
+; FALLBACK13-NEXT:    movq -64(%rsp,%rax), %rax
+; FALLBACK13-NEXT:    movq %rax, %r10
+; FALLBACK13-NEXT:    shrdq %cl, %rdi, %r10
+; FALLBACK13-NEXT:    shrdq %cl, %rax, %r9
+; FALLBACK13-NEXT:    # kill: def $cl killed $cl killed $ecx
+; FALLBACK13-NEXT:    sarq %cl, %rsi
+; FALLBACK13-NEXT:    movq %r10, 8(%rdx)
+; FALLBACK13-NEXT:    movq %r8, 16(%rdx)
+; FALLBACK13-NEXT:    movq %rsi, 24(%rdx)
+; FALLBACK13-NEXT:    movq %r9, (%rdx)
+; FALLBACK13-NEXT:    retq
+;
+; FALLBACK14-LABEL: ashr_32bytes:
+; FALLBACK14:       # %bb.0:
+; FALLBACK14-NEXT:    vmovups (%rdi), %xmm0
+; FALLBACK14-NEXT:    movq 16(%rdi), %rcx
+; FALLBACK14-NEXT:    movq 24(%rdi), %rdi
+; FALLBACK14-NEXT:    movzbl (%rsi), %esi
+; FALLBACK14-NEXT:    leal (,%rsi,8), %eax
+; FALLBACK14-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK14-NEXT:    movq %rcx, -{{[0-9]+}}(%rsp)
+; FALLBACK14-NEXT:    vmovaps %xmm0, -{{[0-9]+}}(%rsp)
+; FALLBACK14-NEXT:    sarq $63, %rdi
+; FALLBACK14-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK14-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK14-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK14-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK14-NEXT:    andb $24, %sil
+; FALLBACK14-NEXT:    movzbl %sil, %ecx
+; FALLBACK14-NEXT:    shrxq %rax, -72(%rsp,%rcx), %rsi
+; FALLBACK14-NEXT:    movq -64(%rsp,%rcx), %rdi
+; FALLBACK14-NEXT:    movq -56(%rsp,%rcx), %r8
+; FALLBACK14-NEXT:    shrxq %rax, %r8, %r9
+; FALLBACK14-NEXT:    movq -48(%rsp,%rcx), %rcx
+; FALLBACK14-NEXT:    shrxq %rax, %rdi, %r10
+; FALLBACK14-NEXT:    sarxq %rax, %rcx, %r11
+; FALLBACK14-NEXT:    # kill: def $al killed $al killed $rax def $rax
+; FALLBACK14-NEXT:    notb %al
+; FALLBACK14-NEXT:    addq %rdi, %rdi
+; FALLBACK14-NEXT:    shlxq %rax, %rdi, %rdi
+; FALLBACK14-NEXT:    orq %rsi, %rdi
+; FALLBACK14-NEXT:    addq %rcx, %rcx
+; FALLBACK14-NEXT:    shlxq %rax, %rcx, %rcx
+; FALLBACK14-NEXT:    orq %r9, %rcx
+; FALLBACK14-NEXT:    addq %r8, %r8
+; FALLBACK14-NEXT:    shlxq %rax, %r8, %rax
+; FALLBACK14-NEXT:    orq %r10, %rax
+; FALLBACK14-NEXT:    movq %r11, 24(%rdx)
+; FALLBACK14-NEXT:    movq %rax, 8(%rdx)
+; FALLBACK14-NEXT:    movq %rcx, 16(%rdx)
+; FALLBACK14-NEXT:    movq %rdi, (%rdx)
+; FALLBACK14-NEXT:    retq
+;
+; FALLBACK15-LABEL: ashr_32bytes:
+; FALLBACK15:       # %bb.0:
+; FALLBACK15-NEXT:    vmovups (%rdi), %xmm0
+; FALLBACK15-NEXT:    movq 16(%rdi), %rax
+; FALLBACK15-NEXT:    movq 24(%rdi), %rdi
+; FALLBACK15-NEXT:    movzbl (%rsi), %esi
+; FALLBACK15-NEXT:    leal (,%rsi,8), %ecx
+; FALLBACK15-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK15-NEXT:    movq %rax, -{{[0-9]+}}(%rsp)
+; FALLBACK15-NEXT:    vmovaps %xmm0, -{{[0-9]+}}(%rsp)
+; FALLBACK15-NEXT:    sarq $63, %rdi
+; FALLBACK15-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK15-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK15-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK15-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK15-NEXT:    andb $24, %sil
+; FALLBACK15-NEXT:    movzbl %sil, %eax
+; FALLBACK15-NEXT:    movq -48(%rsp,%rax), %rsi
+; FALLBACK15-NEXT:    movq -56(%rsp,%rax), %rdi
+; FALLBACK15-NEXT:    movq %rdi, %r8
+; FALLBACK15-NEXT:    shrdq %cl, %rsi, %r8
+; FALLBACK15-NEXT:    movq -72(%rsp,%rax), %r9
+; FALLBACK15-NEXT:    movq -64(%rsp,%rax), %rax
+; FALLBACK15-NEXT:    movq %rax, %r10
+; FALLBACK15-NEXT:    shrdq %cl, %rdi, %r10
+; FALLBACK15-NEXT:    shrdq %cl, %rax, %r9
+; FALLBACK15-NEXT:    sarxq %rcx, %rsi, %rax
+; FALLBACK15-NEXT:    movq %r10, 8(%rdx)
+; FALLBACK15-NEXT:    movq %r8, 16(%rdx)
+; FALLBACK15-NEXT:    movq %rax, 24(%rdx)
+; FALLBACK15-NEXT:    movq %r9, (%rdx)
+; FALLBACK15-NEXT:    retq
+;
+; FALLBACK16-LABEL: ashr_32bytes:
+; FALLBACK16:       # %bb.0:
+; FALLBACK16-NEXT:    pushl %ebp
+; FALLBACK16-NEXT:    pushl %ebx
+; FALLBACK16-NEXT:    pushl %edi
+; FALLBACK16-NEXT:    pushl %esi
+; FALLBACK16-NEXT:    subl $108, %esp
+; FALLBACK16-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; FALLBACK16-NEXT:    movl {{[0-9]+}}(%esp), %esi
+; FALLBACK16-NEXT:    movl (%esi), %ecx
+; FALLBACK16-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK16-NEXT:    movl 4(%esi), %ecx
+; FALLBACK16-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK16-NEXT:    movl 8(%esi), %ebx
+; FALLBACK16-NEXT:    movl 12(%esi), %ebp
+; FALLBACK16-NEXT:    movl 16(%esi), %edi
+; FALLBACK16-NEXT:    movzbl (%eax), %ecx
+; FALLBACK16-NEXT:    movl 20(%esi), %edx
+; FALLBACK16-NEXT:    movl 24(%esi), %eax
+; FALLBACK16-NEXT:    movl 28(%esi), %esi
+; FALLBACK16-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK16-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; FALLBACK16-NEXT:    movl %ecx, %edx
+; FALLBACK16-NEXT:    shlb $3, %dl
+; FALLBACK16-NEXT:    movl %esi, {{[0-9]+}}(%esp)
+; FALLBACK16-NEXT:    movl %edi, {{[0-9]+}}(%esp)
+; FALLBACK16-NEXT:    movl %ebp, {{[0-9]+}}(%esp)
+; FALLBACK16-NEXT:    movl %ebx, {{[0-9]+}}(%esp)
+; FALLBACK16-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK16-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK16-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK16-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK16-NEXT:    sarl $31, %esi
+; FALLBACK16-NEXT:    movl %esi, {{[0-9]+}}(%esp)
+; FALLBACK16-NEXT:    movl %esi, {{[0-9]+}}(%esp)
+; FALLBACK16-NEXT:    movl %esi, {{[0-9]+}}(%esp)
+; FALLBACK16-NEXT:    movl %esi, {{[0-9]+}}(%esp)
+; FALLBACK16-NEXT:    movl %esi, {{[0-9]+}}(%esp)
+; FALLBACK16-NEXT:    movl %esi, {{[0-9]+}}(%esp)
+; FALLBACK16-NEXT:    movl %esi, {{[0-9]+}}(%esp)
+; FALLBACK16-NEXT:    movl %esi, {{[0-9]+}}(%esp)
+; FALLBACK16-NEXT:    andb $28, %cl
+; FALLBACK16-NEXT:    movzbl %cl, %edi
+; FALLBACK16-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK16-NEXT:    movl 32(%esp,%edi), %esi
+; FALLBACK16-NEXT:    movl 36(%esp,%edi), %eax
+; FALLBACK16-NEXT:    movl %eax, %ebx
+; FALLBACK16-NEXT:    movl %edx, %ecx
+; FALLBACK16-NEXT:    shrl %cl, %ebx
+; FALLBACK16-NEXT:    movb %dl, %ch
+; FALLBACK16-NEXT:    notb %ch
+; FALLBACK16-NEXT:    movl 40(%esp,%edi), %edi
+; FALLBACK16-NEXT:    leal (%edi,%edi), %ebp
+; FALLBACK16-NEXT:    movb %ch, %cl
+; FALLBACK16-NEXT:    shll %cl, %ebp
+; FALLBACK16-NEXT:    orl %ebx, %ebp
+; FALLBACK16-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK16-NEXT:    movb %dl, %cl
+; FALLBACK16-NEXT:    shrl %cl, %esi
+; FALLBACK16-NEXT:    addl %eax, %eax
+; FALLBACK16-NEXT:    movb %ch, %cl
+; FALLBACK16-NEXT:    shll %cl, %eax
+; FALLBACK16-NEXT:    orl %esi, %eax
+; FALLBACK16-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK16-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK16-NEXT:    movl 44(%esp,%eax), %ebp
+; FALLBACK16-NEXT:    movl %ebp, %esi
+; FALLBACK16-NEXT:    movb %dl, %cl
+; FALLBACK16-NEXT:    movl %edx, %ebx
+; FALLBACK16-NEXT:    shrl %cl, %esi
+; FALLBACK16-NEXT:    movl 48(%esp,%eax), %edx
+; FALLBACK16-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK16-NEXT:    leal (%edx,%edx), %eax
+; FALLBACK16-NEXT:    movb %ch, %cl
+; FALLBACK16-NEXT:    shll %cl, %eax
+; FALLBACK16-NEXT:    orl %esi, %eax
+; FALLBACK16-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK16-NEXT:    movl %ebx, %edx
+; FALLBACK16-NEXT:    movb %bl, %cl
+; FALLBACK16-NEXT:    shrl %cl, %edi
+; FALLBACK16-NEXT:    addl %ebp, %ebp
+; FALLBACK16-NEXT:    movb %ch, %cl
+; FALLBACK16-NEXT:    shll %cl, %ebp
+; FALLBACK16-NEXT:    orl %edi, %ebp
+; FALLBACK16-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; FALLBACK16-NEXT:    movl 52(%esp,%esi), %edi
+; FALLBACK16-NEXT:    movl %edi, %eax
+; FALLBACK16-NEXT:    movb %bl, %cl
+; FALLBACK16-NEXT:    shrl %cl, %eax
+; FALLBACK16-NEXT:    movl 56(%esp,%esi), %ebx
+; FALLBACK16-NEXT:    leal (%ebx,%ebx), %esi
+; FALLBACK16-NEXT:    movb %ch, %cl
+; FALLBACK16-NEXT:    shll %cl, %esi
+; FALLBACK16-NEXT:    orl %eax, %esi
+; FALLBACK16-NEXT:    movb %dl, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill
+; FALLBACK16-NEXT:    movb %dl, %cl
+; FALLBACK16-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK16-NEXT:    shrl %cl, %eax
+; FALLBACK16-NEXT:    addl %edi, %edi
+; FALLBACK16-NEXT:    movb %ch, %cl
+; FALLBACK16-NEXT:    shll %cl, %edi
+; FALLBACK16-NEXT:    orl %eax, %edi
+; FALLBACK16-NEXT:    movb %dl, %cl
+; FALLBACK16-NEXT:    shrl %cl, %ebx
+; FALLBACK16-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK16-NEXT:    movl 60(%esp,%eax), %eax
+; FALLBACK16-NEXT:    leal (%eax,%eax), %edx
+; FALLBACK16-NEXT:    movb %ch, %cl
+; FALLBACK16-NEXT:    shll %cl, %edx
+; FALLBACK16-NEXT:    orl %ebx, %edx
+; FALLBACK16-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 1-byte Folded Reload
+; FALLBACK16-NEXT:    sarl %cl, %eax
+; FALLBACK16-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; FALLBACK16-NEXT:    movl %eax, 28(%ecx)
+; FALLBACK16-NEXT:    movl %edx, 24(%ecx)
+; FALLBACK16-NEXT:    movl %edi, 16(%ecx)
+; FALLBACK16-NEXT:    movl %esi, 20(%ecx)
+; FALLBACK16-NEXT:    movl %ebp, 8(%ecx)
+; FALLBACK16-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK16-NEXT:    movl %eax, 12(%ecx)
+; FALLBACK16-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK16-NEXT:    movl %eax, (%ecx)
+; FALLBACK16-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK16-NEXT:    movl %eax, 4(%ecx)
+; FALLBACK16-NEXT:    addl $108, %esp
+; FALLBACK16-NEXT:    popl %esi
+; FALLBACK16-NEXT:    popl %edi
+; FALLBACK16-NEXT:    popl %ebx
+; FALLBACK16-NEXT:    popl %ebp
+; FALLBACK16-NEXT:    retl
+;
+; FALLBACK17-LABEL: ashr_32bytes:
+; FALLBACK17:       # %bb.0:
+; FALLBACK17-NEXT:    pushl %ebp
+; FALLBACK17-NEXT:    pushl %ebx
+; FALLBACK17-NEXT:    pushl %edi
+; FALLBACK17-NEXT:    pushl %esi
+; FALLBACK17-NEXT:    subl $92, %esp
+; FALLBACK17-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; FALLBACK17-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; FALLBACK17-NEXT:    movl (%ecx), %edx
+; FALLBACK17-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK17-NEXT:    movl 4(%ecx), %edx
+; FALLBACK17-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK17-NEXT:    movl 8(%ecx), %edx
+; FALLBACK17-NEXT:    movl %edx, (%esp) # 4-byte Spill
+; FALLBACK17-NEXT:    movl 12(%ecx), %ebp
+; FALLBACK17-NEXT:    movl 16(%ecx), %ebx
+; FALLBACK17-NEXT:    movzbl (%eax), %eax
+; FALLBACK17-NEXT:    movl 20(%ecx), %edi
+; FALLBACK17-NEXT:    movl 24(%ecx), %edx
+; FALLBACK17-NEXT:    movl 28(%ecx), %esi
+; FALLBACK17-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; FALLBACK17-NEXT:    movl %edi, {{[0-9]+}}(%esp)
+; FALLBACK17-NEXT:    movl %eax, %ecx
+; FALLBACK17-NEXT:    shlb $3, %cl
+; FALLBACK17-NEXT:    movl %esi, {{[0-9]+}}(%esp)
+; FALLBACK17-NEXT:    movl %ebx, {{[0-9]+}}(%esp)
+; FALLBACK17-NEXT:    movl %ebp, {{[0-9]+}}(%esp)
+; FALLBACK17-NEXT:    movl (%esp), %edx # 4-byte Reload
+; FALLBACK17-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; FALLBACK17-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; FALLBACK17-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; FALLBACK17-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; FALLBACK17-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; FALLBACK17-NEXT:    sarl $31, %esi
+; FALLBACK17-NEXT:    movl %esi, {{[0-9]+}}(%esp)
+; FALLBACK17-NEXT:    movl %esi, {{[0-9]+}}(%esp)
+; FALLBACK17-NEXT:    movl %esi, {{[0-9]+}}(%esp)
+; FALLBACK17-NEXT:    movl %esi, {{[0-9]+}}(%esp)
+; FALLBACK17-NEXT:    movl %esi, {{[0-9]+}}(%esp)
+; FALLBACK17-NEXT:    movl %esi, {{[0-9]+}}(%esp)
+; FALLBACK17-NEXT:    movl %esi, {{[0-9]+}}(%esp)
+; FALLBACK17-NEXT:    movl %esi, {{[0-9]+}}(%esp)
+; FALLBACK17-NEXT:    andb $28, %al
+; FALLBACK17-NEXT:    movzbl %al, %ebp
+; FALLBACK17-NEXT:    movl 24(%esp,%ebp), %edx
+; FALLBACK17-NEXT:    movl 20(%esp,%ebp), %eax
+; FALLBACK17-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK17-NEXT:    shrdl %cl, %edx, %eax
+; FALLBACK17-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK17-NEXT:    movl 32(%esp,%ebp), %ebx
+; FALLBACK17-NEXT:    movl 28(%esp,%ebp), %eax
+; FALLBACK17-NEXT:    movl %eax, %esi
+; FALLBACK17-NEXT:    shrdl %cl, %ebx, %esi
+; FALLBACK17-NEXT:    movl %esi, (%esp) # 4-byte Spill
+; FALLBACK17-NEXT:    shrdl %cl, %eax, %edx
+; FALLBACK17-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK17-NEXT:    movl 40(%esp,%ebp), %edx
+; FALLBACK17-NEXT:    movl 36(%esp,%ebp), %eax
+; FALLBACK17-NEXT:    movl %eax, %edi
+; FALLBACK17-NEXT:    shrdl %cl, %edx, %edi
+; FALLBACK17-NEXT:    shrdl %cl, %eax, %ebx
+; FALLBACK17-NEXT:    movl 16(%esp,%ebp), %esi
+; FALLBACK17-NEXT:    movl 44(%esp,%ebp), %eax
+; FALLBACK17-NEXT:    shrdl %cl, %eax, %edx
+; FALLBACK17-NEXT:    movl {{[0-9]+}}(%esp), %ebp
+; FALLBACK17-NEXT:    movl %edx, 24(%ebp)
+; FALLBACK17-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; FALLBACK17-NEXT:    shrdl %cl, %edx, %esi
+; FALLBACK17-NEXT:    sarl %cl, %eax
+; FALLBACK17-NEXT:    movl %eax, 28(%ebp)
+; FALLBACK17-NEXT:    movl %ebx, 16(%ebp)
+; FALLBACK17-NEXT:    movl %edi, 20(%ebp)
+; FALLBACK17-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK17-NEXT:    movl %eax, 8(%ebp)
+; FALLBACK17-NEXT:    movl (%esp), %eax # 4-byte Reload
+; FALLBACK17-NEXT:    movl %eax, 12(%ebp)
+; FALLBACK17-NEXT:    movl %esi, (%ebp)
+; FALLBACK17-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK17-NEXT:    movl %eax, 4(%ebp)
+; FALLBACK17-NEXT:    addl $92, %esp
+; FALLBACK17-NEXT:    popl %esi
+; FALLBACK17-NEXT:    popl %edi
+; FALLBACK17-NEXT:    popl %ebx
+; FALLBACK17-NEXT:    popl %ebp
+; FALLBACK17-NEXT:    retl
+;
+; FALLBACK18-LABEL: ashr_32bytes:
+; FALLBACK18:       # %bb.0:
+; FALLBACK18-NEXT:    pushl %ebp
+; FALLBACK18-NEXT:    pushl %ebx
+; FALLBACK18-NEXT:    pushl %edi
+; FALLBACK18-NEXT:    pushl %esi
+; FALLBACK18-NEXT:    subl $108, %esp
+; FALLBACK18-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; FALLBACK18-NEXT:    movl {{[0-9]+}}(%esp), %esi
+; FALLBACK18-NEXT:    movl (%esi), %eax
+; FALLBACK18-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK18-NEXT:    movl 4(%esi), %eax
+; FALLBACK18-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK18-NEXT:    movl 8(%esi), %ebx
+; FALLBACK18-NEXT:    movl 12(%esi), %ebp
+; FALLBACK18-NEXT:    movl 16(%esi), %edi
+; FALLBACK18-NEXT:    movzbl (%ecx), %ecx
+; FALLBACK18-NEXT:    movl 20(%esi), %edx
+; FALLBACK18-NEXT:    movl 24(%esi), %eax
+; FALLBACK18-NEXT:    movl 28(%esi), %esi
+; FALLBACK18-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK18-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; FALLBACK18-NEXT:    movl %edi, {{[0-9]+}}(%esp)
+; FALLBACK18-NEXT:    movl %ecx, %eax
+; FALLBACK18-NEXT:    shlb $3, %al
+; FALLBACK18-NEXT:    movl %esi, {{[0-9]+}}(%esp)
+; FALLBACK18-NEXT:    movl %ebp, {{[0-9]+}}(%esp)
+; FALLBACK18-NEXT:    movl %ebx, {{[0-9]+}}(%esp)
+; FALLBACK18-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; FALLBACK18-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; FALLBACK18-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; FALLBACK18-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; FALLBACK18-NEXT:    sarl $31, %esi
+; FALLBACK18-NEXT:    movl %esi, {{[0-9]+}}(%esp)
+; FALLBACK18-NEXT:    movl %esi, {{[0-9]+}}(%esp)
+; FALLBACK18-NEXT:    movl %esi, {{[0-9]+}}(%esp)
+; FALLBACK18-NEXT:    movl %esi, {{[0-9]+}}(%esp)
+; FALLBACK18-NEXT:    movl %esi, {{[0-9]+}}(%esp)
+; FALLBACK18-NEXT:    movl %esi, {{[0-9]+}}(%esp)
+; FALLBACK18-NEXT:    movl %esi, {{[0-9]+}}(%esp)
+; FALLBACK18-NEXT:    movl %esi, {{[0-9]+}}(%esp)
+; FALLBACK18-NEXT:    andb $28, %cl
+; FALLBACK18-NEXT:    movzbl %cl, %edi
+; FALLBACK18-NEXT:    movl 36(%esp,%edi), %esi
+; FALLBACK18-NEXT:    movl 40(%esp,%edi), %ecx
+; FALLBACK18-NEXT:    shrxl %eax, %esi, %ebx
+; FALLBACK18-NEXT:    movl %eax, %edx
+; FALLBACK18-NEXT:    notb %dl
+; FALLBACK18-NEXT:    leal (%ecx,%ecx), %ebp
+; FALLBACK18-NEXT:    shlxl %edx, %ebp, %ebp
+; FALLBACK18-NEXT:    orl %ebx, %ebp
+; FALLBACK18-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK18-NEXT:    shrxl %eax, 32(%esp,%edi), %ebx
+; FALLBACK18-NEXT:    addl %esi, %esi
+; FALLBACK18-NEXT:    shlxl %edx, %esi, %esi
+; FALLBACK18-NEXT:    orl %ebx, %esi
+; FALLBACK18-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK18-NEXT:    movl 48(%esp,%edi), %esi
+; FALLBACK18-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK18-NEXT:    leal (%esi,%esi), %ebx
+; FALLBACK18-NEXT:    shlxl %edx, %ebx, %esi
+; FALLBACK18-NEXT:    movl 44(%esp,%edi), %ebp
+; FALLBACK18-NEXT:    shrxl %eax, %ebp, %ebx
+; FALLBACK18-NEXT:    orl %ebx, %esi
+; FALLBACK18-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK18-NEXT:    shrxl %eax, %ecx, %ecx
+; FALLBACK18-NEXT:    movl %eax, %ebx
+; FALLBACK18-NEXT:    addl %ebp, %ebp
+; FALLBACK18-NEXT:    shlxl %edx, %ebp, %eax
+; FALLBACK18-NEXT:    orl %ecx, %eax
+; FALLBACK18-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK18-NEXT:    movl 56(%esp,%edi), %ebp
+; FALLBACK18-NEXT:    leal (%ebp,%ebp), %ecx
+; FALLBACK18-NEXT:    shlxl %edx, %ecx, %ecx
+; FALLBACK18-NEXT:    movl 52(%esp,%edi), %eax
+; FALLBACK18-NEXT:    shrxl %ebx, %eax, %esi
+; FALLBACK18-NEXT:    orl %esi, %ecx
+; FALLBACK18-NEXT:    shrxl %ebx, {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
+; FALLBACK18-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK18-NEXT:    addl %eax, %eax
+; FALLBACK18-NEXT:    shlxl %edx, %eax, %esi
+; FALLBACK18-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
+; FALLBACK18-NEXT:    shrxl %ebx, %ebp, %eax
+; FALLBACK18-NEXT:    movl 60(%esp,%edi), %edi
+; FALLBACK18-NEXT:    sarxl %ebx, %edi, %ebx
+; FALLBACK18-NEXT:    addl %edi, %edi
+; FALLBACK18-NEXT:    shlxl %edx, %edi, %edx
+; FALLBACK18-NEXT:    orl %eax, %edx
+; FALLBACK18-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; FALLBACK18-NEXT:    movl %ebx, 28(%eax)
+; FALLBACK18-NEXT:    movl %edx, 24(%eax)
+; FALLBACK18-NEXT:    movl %esi, 16(%eax)
+; FALLBACK18-NEXT:    movl %ecx, 20(%eax)
+; FALLBACK18-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK18-NEXT:    movl %ecx, 8(%eax)
+; FALLBACK18-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK18-NEXT:    movl %ecx, 12(%eax)
+; FALLBACK18-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK18-NEXT:    movl %ecx, (%eax)
+; FALLBACK18-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK18-NEXT:    movl %ecx, 4(%eax)
+; FALLBACK18-NEXT:    addl $108, %esp
+; FALLBACK18-NEXT:    popl %esi
+; FALLBACK18-NEXT:    popl %edi
+; FALLBACK18-NEXT:    popl %ebx
+; FALLBACK18-NEXT:    popl %ebp
+; FALLBACK18-NEXT:    retl
+;
+; FALLBACK19-LABEL: ashr_32bytes:
+; FALLBACK19:       # %bb.0:
+; FALLBACK19-NEXT:    pushl %ebp
+; FALLBACK19-NEXT:    pushl %ebx
+; FALLBACK19-NEXT:    pushl %edi
+; FALLBACK19-NEXT:    pushl %esi
+; FALLBACK19-NEXT:    subl $92, %esp
+; FALLBACK19-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; FALLBACK19-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; FALLBACK19-NEXT:    movl (%ecx), %edx
+; FALLBACK19-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK19-NEXT:    movl 4(%ecx), %edx
+; FALLBACK19-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK19-NEXT:    movl 8(%ecx), %edx
+; FALLBACK19-NEXT:    movl %edx, (%esp) # 4-byte Spill
+; FALLBACK19-NEXT:    movl 12(%ecx), %ebp
+; FALLBACK19-NEXT:    movl 16(%ecx), %ebx
+; FALLBACK19-NEXT:    movzbl (%eax), %eax
+; FALLBACK19-NEXT:    movl 20(%ecx), %edi
+; FALLBACK19-NEXT:    movl 24(%ecx), %edx
+; FALLBACK19-NEXT:    movl 28(%ecx), %esi
+; FALLBACK19-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; FALLBACK19-NEXT:    movl %edi, {{[0-9]+}}(%esp)
+; FALLBACK19-NEXT:    movl %ebx, {{[0-9]+}}(%esp)
+; FALLBACK19-NEXT:    movl %eax, %ecx
+; FALLBACK19-NEXT:    shlb $3, %cl
+; FALLBACK19-NEXT:    movl %esi, {{[0-9]+}}(%esp)
+; FALLBACK19-NEXT:    movl %ebp, {{[0-9]+}}(%esp)
+; FALLBACK19-NEXT:    movl (%esp), %edx # 4-byte Reload
+; FALLBACK19-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; FALLBACK19-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; FALLBACK19-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; FALLBACK19-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; FALLBACK19-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; FALLBACK19-NEXT:    sarl $31, %esi
+; FALLBACK19-NEXT:    movl %esi, {{[0-9]+}}(%esp)
+; FALLBACK19-NEXT:    movl %esi, {{[0-9]+}}(%esp)
+; FALLBACK19-NEXT:    movl %esi, {{[0-9]+}}(%esp)
+; FALLBACK19-NEXT:    movl %esi, {{[0-9]+}}(%esp)
+; FALLBACK19-NEXT:    movl %esi, {{[0-9]+}}(%esp)
+; FALLBACK19-NEXT:    movl %esi, {{[0-9]+}}(%esp)
+; FALLBACK19-NEXT:    movl %esi, {{[0-9]+}}(%esp)
+; FALLBACK19-NEXT:    movl %esi, {{[0-9]+}}(%esp)
+; FALLBACK19-NEXT:    andb $28, %al
+; FALLBACK19-NEXT:    movzbl %al, %ebp
+; FALLBACK19-NEXT:    movl 24(%esp,%ebp), %esi
+; FALLBACK19-NEXT:    movl 20(%esp,%ebp), %eax
+; FALLBACK19-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK19-NEXT:    shrdl %cl, %esi, %eax
+; FALLBACK19-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK19-NEXT:    movl 32(%esp,%ebp), %ebx
+; FALLBACK19-NEXT:    movl 28(%esp,%ebp), %eax
+; FALLBACK19-NEXT:    movl %eax, %edx
+; FALLBACK19-NEXT:    shrdl %cl, %ebx, %edx
+; FALLBACK19-NEXT:    movl %edx, (%esp) # 4-byte Spill
+; FALLBACK19-NEXT:    shrdl %cl, %eax, %esi
+; FALLBACK19-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK19-NEXT:    movl 40(%esp,%ebp), %eax
+; FALLBACK19-NEXT:    movl 36(%esp,%ebp), %edx
+; FALLBACK19-NEXT:    movl %edx, %esi
+; FALLBACK19-NEXT:    shrdl %cl, %eax, %esi
+; FALLBACK19-NEXT:    shrdl %cl, %edx, %ebx
+; FALLBACK19-NEXT:    movl 16(%esp,%ebp), %edx
+; FALLBACK19-NEXT:    movl 44(%esp,%ebp), %edi
+; FALLBACK19-NEXT:    shrdl %cl, %edi, %eax
+; FALLBACK19-NEXT:    movl {{[0-9]+}}(%esp), %ebp
+; FALLBACK19-NEXT:    movl %eax, 24(%ebp)
+; FALLBACK19-NEXT:    sarxl %ecx, %edi, %eax
+; FALLBACK19-NEXT:    movl %eax, 28(%ebp)
+; FALLBACK19-NEXT:    movl %ebx, 16(%ebp)
+; FALLBACK19-NEXT:    movl %esi, 20(%ebp)
+; FALLBACK19-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK19-NEXT:    movl %eax, 8(%ebp)
+; FALLBACK19-NEXT:    movl (%esp), %eax # 4-byte Reload
+; FALLBACK19-NEXT:    movl %eax, 12(%ebp)
+; FALLBACK19-NEXT:    # kill: def $cl killed $cl killed $ecx
+; FALLBACK19-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK19-NEXT:    shrdl %cl, %eax, %edx
+; FALLBACK19-NEXT:    movl %edx, (%ebp)
+; FALLBACK19-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK19-NEXT:    movl %eax, 4(%ebp)
+; FALLBACK19-NEXT:    addl $92, %esp
+; FALLBACK19-NEXT:    popl %esi
+; FALLBACK19-NEXT:    popl %edi
+; FALLBACK19-NEXT:    popl %ebx
+; FALLBACK19-NEXT:    popl %ebp
+; FALLBACK19-NEXT:    retl
+;
+; FALLBACK20-LABEL: ashr_32bytes:
+; FALLBACK20:       # %bb.0:
+; FALLBACK20-NEXT:    pushl %ebp
+; FALLBACK20-NEXT:    pushl %ebx
+; FALLBACK20-NEXT:    pushl %edi
+; FALLBACK20-NEXT:    pushl %esi
+; FALLBACK20-NEXT:    subl $108, %esp
+; FALLBACK20-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; FALLBACK20-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; FALLBACK20-NEXT:    movups (%ecx), %xmm0
+; FALLBACK20-NEXT:    movl 16(%ecx), %esi
+; FALLBACK20-NEXT:    movl 20(%ecx), %edi
+; FALLBACK20-NEXT:    movl 24(%ecx), %ebx
+; FALLBACK20-NEXT:    movl 28(%ecx), %edx
+; FALLBACK20-NEXT:    movzbl (%eax), %eax
+; FALLBACK20-NEXT:    movl %eax, %ecx
+; FALLBACK20-NEXT:    shlb $3, %cl
+; FALLBACK20-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; FALLBACK20-NEXT:    movl %ebx, {{[0-9]+}}(%esp)
+; FALLBACK20-NEXT:    movl %edi, {{[0-9]+}}(%esp)
+; FALLBACK20-NEXT:    movl %esi, {{[0-9]+}}(%esp)
+; FALLBACK20-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
+; FALLBACK20-NEXT:    sarl $31, %edx
+; FALLBACK20-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; FALLBACK20-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; FALLBACK20-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; FALLBACK20-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; FALLBACK20-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; FALLBACK20-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; FALLBACK20-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; FALLBACK20-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; FALLBACK20-NEXT:    andb $28, %al
+; FALLBACK20-NEXT:    movzbl %al, %edi
+; FALLBACK20-NEXT:    movl 32(%esp,%edi), %eax
+; FALLBACK20-NEXT:    movl 36(%esp,%edi), %esi
+; FALLBACK20-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK20-NEXT:    shrl %cl, %eax
+; FALLBACK20-NEXT:    movl %ecx, %edx
+; FALLBACK20-NEXT:    movb %cl, %dh
+; FALLBACK20-NEXT:    notb %dl
+; FALLBACK20-NEXT:    addl %esi, %esi
+; FALLBACK20-NEXT:    movl %edx, %ecx
+; FALLBACK20-NEXT:    shll %cl, %esi
+; FALLBACK20-NEXT:    orl %eax, %esi
+; FALLBACK20-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK20-NEXT:    movl 44(%esp,%edi), %ebx
+; FALLBACK20-NEXT:    movl %ebx, %eax
+; FALLBACK20-NEXT:    movb %dh, %cl
+; FALLBACK20-NEXT:    shrl %cl, %eax
+; FALLBACK20-NEXT:    movl 48(%esp,%edi), %esi
+; FALLBACK20-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK20-NEXT:    addl %esi, %esi
+; FALLBACK20-NEXT:    movl %edx, %ecx
+; FALLBACK20-NEXT:    shll %cl, %esi
+; FALLBACK20-NEXT:    orl %eax, %esi
+; FALLBACK20-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK20-NEXT:    movl 40(%esp,%edi), %esi
+; FALLBACK20-NEXT:    movl %esi, %eax
+; FALLBACK20-NEXT:    movb %dh, %cl
+; FALLBACK20-NEXT:    shrl %cl, %eax
+; FALLBACK20-NEXT:    addl %ebx, %ebx
+; FALLBACK20-NEXT:    movl %edx, %ecx
+; FALLBACK20-NEXT:    shll %cl, %ebx
+; FALLBACK20-NEXT:    orl %eax, %ebx
+; FALLBACK20-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK20-NEXT:    movl 52(%esp,%edi), %ebp
+; FALLBACK20-NEXT:    movl %ebp, %eax
+; FALLBACK20-NEXT:    movb %dh, %cl
+; FALLBACK20-NEXT:    shrl %cl, %eax
+; FALLBACK20-NEXT:    movl 56(%esp,%edi), %ecx
+; FALLBACK20-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK20-NEXT:    leal (%ecx,%ecx), %ebx
+; FALLBACK20-NEXT:    movl %edx, %ecx
+; FALLBACK20-NEXT:    shll %cl, %ebx
+; FALLBACK20-NEXT:    orl %eax, %ebx
+; FALLBACK20-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK20-NEXT:    movb %dh, %cl
+; FALLBACK20-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK20-NEXT:    shrl %cl, %eax
+; FALLBACK20-NEXT:    addl %ebp, %ebp
+; FALLBACK20-NEXT:    movl %edx, %ecx
+; FALLBACK20-NEXT:    shll %cl, %ebp
+; FALLBACK20-NEXT:    orl %eax, %ebp
+; FALLBACK20-NEXT:    movb %dh, %cl
+; FALLBACK20-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; FALLBACK20-NEXT:    shrl %cl, %ebx
+; FALLBACK20-NEXT:    movl 60(%esp,%edi), %eax
+; FALLBACK20-NEXT:    leal (%eax,%eax), %edi
+; FALLBACK20-NEXT:    movl %edx, %ecx
+; FALLBACK20-NEXT:    shll %cl, %edi
+; FALLBACK20-NEXT:    orl %ebx, %edi
+; FALLBACK20-NEXT:    movb %dh, %cl
+; FALLBACK20-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; FALLBACK20-NEXT:    shrl %cl, %ebx
+; FALLBACK20-NEXT:    addl %esi, %esi
+; FALLBACK20-NEXT:    movl %edx, %ecx
+; FALLBACK20-NEXT:    shll %cl, %esi
+; FALLBACK20-NEXT:    orl %ebx, %esi
+; FALLBACK20-NEXT:    movb %dh, %cl
+; FALLBACK20-NEXT:    sarl %cl, %eax
+; FALLBACK20-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; FALLBACK20-NEXT:    movl %eax, 28(%ecx)
+; FALLBACK20-NEXT:    movl %esi, 4(%ecx)
+; FALLBACK20-NEXT:    movl %edi, 24(%ecx)
+; FALLBACK20-NEXT:    movl %ebp, 16(%ecx)
+; FALLBACK20-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK20-NEXT:    movl %eax, 20(%ecx)
+; FALLBACK20-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK20-NEXT:    movl %eax, 8(%ecx)
+; FALLBACK20-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK20-NEXT:    movl %eax, 12(%ecx)
+; FALLBACK20-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK20-NEXT:    movl %eax, (%ecx)
+; FALLBACK20-NEXT:    addl $108, %esp
+; FALLBACK20-NEXT:    popl %esi
+; FALLBACK20-NEXT:    popl %edi
+; FALLBACK20-NEXT:    popl %ebx
+; FALLBACK20-NEXT:    popl %ebp
+; FALLBACK20-NEXT:    retl
+;
+; FALLBACK21-LABEL: ashr_32bytes:
+; FALLBACK21:       # %bb.0:
+; FALLBACK21-NEXT:    pushl %ebp
+; FALLBACK21-NEXT:    pushl %ebx
+; FALLBACK21-NEXT:    pushl %edi
+; FALLBACK21-NEXT:    pushl %esi
+; FALLBACK21-NEXT:    subl $108, %esp
+; FALLBACK21-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; FALLBACK21-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; FALLBACK21-NEXT:    movups (%ecx), %xmm0
+; FALLBACK21-NEXT:    movl 16(%ecx), %esi
+; FALLBACK21-NEXT:    movl 20(%ecx), %edi
+; FALLBACK21-NEXT:    movl 24(%ecx), %ebx
+; FALLBACK21-NEXT:    movl 28(%ecx), %edx
+; FALLBACK21-NEXT:    movzbl (%eax), %eax
+; FALLBACK21-NEXT:    movl %eax, %ecx
+; FALLBACK21-NEXT:    shlb $3, %cl
+; FALLBACK21-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; FALLBACK21-NEXT:    movl %ebx, {{[0-9]+}}(%esp)
+; FALLBACK21-NEXT:    movl %edi, {{[0-9]+}}(%esp)
+; FALLBACK21-NEXT:    movl %esi, {{[0-9]+}}(%esp)
+; FALLBACK21-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
+; FALLBACK21-NEXT:    sarl $31, %edx
+; FALLBACK21-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; FALLBACK21-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; FALLBACK21-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; FALLBACK21-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; FALLBACK21-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; FALLBACK21-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; FALLBACK21-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; FALLBACK21-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; FALLBACK21-NEXT:    andb $28, %al
+; FALLBACK21-NEXT:    movzbl %al, %ebp
+; FALLBACK21-NEXT:    movl 48(%esp,%ebp), %esi
+; FALLBACK21-NEXT:    movl 44(%esp,%ebp), %eax
+; FALLBACK21-NEXT:    movl %eax, %edx
+; FALLBACK21-NEXT:    shrdl %cl, %esi, %edx
+; FALLBACK21-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK21-NEXT:    movl 40(%esp,%ebp), %edx
+; FALLBACK21-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK21-NEXT:    shrdl %cl, %eax, %edx
+; FALLBACK21-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK21-NEXT:    movl 56(%esp,%ebp), %ebx
+; FALLBACK21-NEXT:    movl 52(%esp,%ebp), %eax
+; FALLBACK21-NEXT:    movl %eax, %edx
+; FALLBACK21-NEXT:    shrdl %cl, %ebx, %edx
+; FALLBACK21-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK21-NEXT:    shrdl %cl, %eax, %esi
+; FALLBACK21-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK21-NEXT:    movl 60(%esp,%ebp), %eax
+; FALLBACK21-NEXT:    shrdl %cl, %eax, %ebx
+; FALLBACK21-NEXT:    movl 32(%esp,%ebp), %edx
+; FALLBACK21-NEXT:    movl 36(%esp,%ebp), %edi
+; FALLBACK21-NEXT:    movl %edi, %esi
+; FALLBACK21-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
+; FALLBACK21-NEXT:    shrdl %cl, %ebp, %esi
+; FALLBACK21-NEXT:    movl {{[0-9]+}}(%esp), %ebp
+; FALLBACK21-NEXT:    movl %esi, 4(%ebp)
+; FALLBACK21-NEXT:    movl %ebx, 24(%ebp)
+; FALLBACK21-NEXT:    shrdl %cl, %edi, %edx
+; FALLBACK21-NEXT:    sarl %cl, %eax
+; FALLBACK21-NEXT:    movl %eax, 28(%ebp)
+; FALLBACK21-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK21-NEXT:    movl %eax, 16(%ebp)
+; FALLBACK21-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK21-NEXT:    movl %eax, 20(%ebp)
+; FALLBACK21-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK21-NEXT:    movl %eax, 8(%ebp)
+; FALLBACK21-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK21-NEXT:    movl %eax, 12(%ebp)
+; FALLBACK21-NEXT:    movl %edx, (%ebp)
+; FALLBACK21-NEXT:    addl $108, %esp
+; FALLBACK21-NEXT:    popl %esi
+; FALLBACK21-NEXT:    popl %edi
+; FALLBACK21-NEXT:    popl %ebx
+; FALLBACK21-NEXT:    popl %ebp
+; FALLBACK21-NEXT:    retl
+;
+; FALLBACK22-LABEL: ashr_32bytes:
+; FALLBACK22:       # %bb.0:
+; FALLBACK22-NEXT:    pushl %ebp
+; FALLBACK22-NEXT:    pushl %ebx
+; FALLBACK22-NEXT:    pushl %edi
+; FALLBACK22-NEXT:    pushl %esi
+; FALLBACK22-NEXT:    subl $108, %esp
+; FALLBACK22-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; FALLBACK22-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; FALLBACK22-NEXT:    movups (%ecx), %xmm0
+; FALLBACK22-NEXT:    movl 16(%ecx), %esi
+; FALLBACK22-NEXT:    movl 20(%ecx), %edi
+; FALLBACK22-NEXT:    movl 24(%ecx), %ebx
+; FALLBACK22-NEXT:    movl 28(%ecx), %edx
+; FALLBACK22-NEXT:    movzbl (%eax), %ecx
+; FALLBACK22-NEXT:    movl %ecx, %eax
+; FALLBACK22-NEXT:    shlb $3, %al
+; FALLBACK22-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; FALLBACK22-NEXT:    movl %ebx, {{[0-9]+}}(%esp)
+; FALLBACK22-NEXT:    movl %edi, {{[0-9]+}}(%esp)
+; FALLBACK22-NEXT:    movl %esi, {{[0-9]+}}(%esp)
+; FALLBACK22-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
+; FALLBACK22-NEXT:    sarl $31, %edx
+; FALLBACK22-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; FALLBACK22-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; FALLBACK22-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; FALLBACK22-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; FALLBACK22-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; FALLBACK22-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; FALLBACK22-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; FALLBACK22-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; FALLBACK22-NEXT:    andb $28, %cl
+; FALLBACK22-NEXT:    movzbl %cl, %edi
+; FALLBACK22-NEXT:    shrxl %eax, 32(%esp,%edi), %ecx
+; FALLBACK22-NEXT:    movl %eax, %edx
+; FALLBACK22-NEXT:    notb %dl
+; FALLBACK22-NEXT:    movl 36(%esp,%edi), %esi
+; FALLBACK22-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK22-NEXT:    addl %esi, %esi
+; FALLBACK22-NEXT:    shlxl %edx, %esi, %esi
+; FALLBACK22-NEXT:    orl %ecx, %esi
+; FALLBACK22-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK22-NEXT:    movl 48(%esp,%edi), %ecx
+; FALLBACK22-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK22-NEXT:    addl %ecx, %ecx
+; FALLBACK22-NEXT:    shlxl %edx, %ecx, %esi
+; FALLBACK22-NEXT:    movl 44(%esp,%edi), %ecx
+; FALLBACK22-NEXT:    shrxl %eax, %ecx, %ebx
+; FALLBACK22-NEXT:    orl %ebx, %esi
+; FALLBACK22-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK22-NEXT:    addl %ecx, %ecx
+; FALLBACK22-NEXT:    shlxl %edx, %ecx, %esi
+; FALLBACK22-NEXT:    movl 40(%esp,%edi), %ecx
+; FALLBACK22-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK22-NEXT:    shrxl %eax, %ecx, %ebx
+; FALLBACK22-NEXT:    movl %eax, %ecx
+; FALLBACK22-NEXT:    orl %ebx, %esi
+; FALLBACK22-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK22-NEXT:    movl 56(%esp,%edi), %esi
+; FALLBACK22-NEXT:    leal (%esi,%esi), %ebx
+; FALLBACK22-NEXT:    shlxl %edx, %ebx, %eax
+; FALLBACK22-NEXT:    movl 52(%esp,%edi), %ebx
+; FALLBACK22-NEXT:    shrxl %ecx, %ebx, %ebp
+; FALLBACK22-NEXT:    orl %ebp, %eax
+; FALLBACK22-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK22-NEXT:    movl %ecx, %eax
+; FALLBACK22-NEXT:    shrxl %ecx, {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
+; FALLBACK22-NEXT:    addl %ebx, %ebx
+; FALLBACK22-NEXT:    shlxl %edx, %ebx, %ebx
+; FALLBACK22-NEXT:    orl %ebp, %ebx
+; FALLBACK22-NEXT:    shrxl %ecx, %esi, %ecx
+; FALLBACK22-NEXT:    shrxl %eax, {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
+; FALLBACK22-NEXT:    movl 60(%esp,%edi), %edi
+; FALLBACK22-NEXT:    sarxl %eax, %edi, %eax
+; FALLBACK22-NEXT:    addl %edi, %edi
+; FALLBACK22-NEXT:    shlxl %edx, %edi, %edi
+; FALLBACK22-NEXT:    orl %ecx, %edi
+; FALLBACK22-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK22-NEXT:    addl %ecx, %ecx
+; FALLBACK22-NEXT:    shlxl %edx, %ecx, %ecx
+; FALLBACK22-NEXT:    orl %esi, %ecx
+; FALLBACK22-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; FALLBACK22-NEXT:    movl %eax, 28(%edx)
+; FALLBACK22-NEXT:    movl %ecx, 4(%edx)
+; FALLBACK22-NEXT:    movl %edi, 24(%edx)
+; FALLBACK22-NEXT:    movl %ebx, 16(%edx)
+; FALLBACK22-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK22-NEXT:    movl %eax, 20(%edx)
+; FALLBACK22-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK22-NEXT:    movl %eax, 8(%edx)
+; FALLBACK22-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK22-NEXT:    movl %eax, 12(%edx)
+; FALLBACK22-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK22-NEXT:    movl %eax, (%edx)
+; FALLBACK22-NEXT:    addl $108, %esp
+; FALLBACK22-NEXT:    popl %esi
+; FALLBACK22-NEXT:    popl %edi
+; FALLBACK22-NEXT:    popl %ebx
+; FALLBACK22-NEXT:    popl %ebp
+; FALLBACK22-NEXT:    retl
+;
+; FALLBACK23-LABEL: ashr_32bytes:
+; FALLBACK23:       # %bb.0:
+; FALLBACK23-NEXT:    pushl %ebp
+; FALLBACK23-NEXT:    pushl %ebx
+; FALLBACK23-NEXT:    pushl %edi
+; FALLBACK23-NEXT:    pushl %esi
+; FALLBACK23-NEXT:    subl $108, %esp
+; FALLBACK23-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; FALLBACK23-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; FALLBACK23-NEXT:    movups (%ecx), %xmm0
+; FALLBACK23-NEXT:    movl 16(%ecx), %esi
+; FALLBACK23-NEXT:    movl 20(%ecx), %edi
+; FALLBACK23-NEXT:    movl 24(%ecx), %ebx
+; FALLBACK23-NEXT:    movl 28(%ecx), %edx
+; FALLBACK23-NEXT:    movzbl (%eax), %eax
+; FALLBACK23-NEXT:    movl %eax, %ecx
+; FALLBACK23-NEXT:    shlb $3, %cl
+; FALLBACK23-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; FALLBACK23-NEXT:    movl %ebx, {{[0-9]+}}(%esp)
+; FALLBACK23-NEXT:    movl %edi, {{[0-9]+}}(%esp)
+; FALLBACK23-NEXT:    movl %esi, {{[0-9]+}}(%esp)
+; FALLBACK23-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
+; FALLBACK23-NEXT:    sarl $31, %edx
+; FALLBACK23-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; FALLBACK23-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; FALLBACK23-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; FALLBACK23-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; FALLBACK23-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; FALLBACK23-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; FALLBACK23-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; FALLBACK23-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; FALLBACK23-NEXT:    andb $28, %al
+; FALLBACK23-NEXT:    movzbl %al, %ebx
+; FALLBACK23-NEXT:    movl 48(%esp,%ebx), %esi
+; FALLBACK23-NEXT:    movl 44(%esp,%ebx), %eax
+; FALLBACK23-NEXT:    movl %eax, %edx
+; FALLBACK23-NEXT:    shrdl %cl, %esi, %edx
+; FALLBACK23-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK23-NEXT:    movl 40(%esp,%ebx), %edx
+; FALLBACK23-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK23-NEXT:    shrdl %cl, %eax, %edx
+; FALLBACK23-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK23-NEXT:    movl 56(%esp,%ebx), %ebp
+; FALLBACK23-NEXT:    movl 52(%esp,%ebx), %eax
+; FALLBACK23-NEXT:    movl %eax, %edi
+; FALLBACK23-NEXT:    shrdl %cl, %ebp, %edi
+; FALLBACK23-NEXT:    shrdl %cl, %eax, %esi
+; FALLBACK23-NEXT:    movl 60(%esp,%ebx), %eax
+; FALLBACK23-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK23-NEXT:    shrdl %cl, %eax, %ebp
+; FALLBACK23-NEXT:    movl 32(%esp,%ebx), %edx
+; FALLBACK23-NEXT:    movl 36(%esp,%ebx), %ebx
+; FALLBACK23-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK23-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK23-NEXT:    shrdl %cl, %eax, %ebx
+; FALLBACK23-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; FALLBACK23-NEXT:    movl %ebx, 4(%eax)
+; FALLBACK23-NEXT:    movl %ebp, 24(%eax)
+; FALLBACK23-NEXT:    sarxl %ecx, {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
+; FALLBACK23-NEXT:    movl %ebx, 28(%eax)
+; FALLBACK23-NEXT:    movl %esi, 16(%eax)
+; FALLBACK23-NEXT:    movl %edi, 20(%eax)
+; FALLBACK23-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; FALLBACK23-NEXT:    movl %esi, 8(%eax)
+; FALLBACK23-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; FALLBACK23-NEXT:    movl %esi, 12(%eax)
+; FALLBACK23-NEXT:    # kill: def $cl killed $cl killed $ecx
+; FALLBACK23-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; FALLBACK23-NEXT:    shrdl %cl, %esi, %edx
+; FALLBACK23-NEXT:    movl %edx, (%eax)
+; FALLBACK23-NEXT:    addl $108, %esp
+; FALLBACK23-NEXT:    popl %esi
+; FALLBACK23-NEXT:    popl %edi
+; FALLBACK23-NEXT:    popl %ebx
+; FALLBACK23-NEXT:    popl %ebp
+; FALLBACK23-NEXT:    retl
+;
+; FALLBACK24-LABEL: ashr_32bytes:
+; FALLBACK24:       # %bb.0:
+; FALLBACK24-NEXT:    pushl %ebp
+; FALLBACK24-NEXT:    pushl %ebx
+; FALLBACK24-NEXT:    pushl %edi
+; FALLBACK24-NEXT:    pushl %esi
+; FALLBACK24-NEXT:    subl $108, %esp
+; FALLBACK24-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; FALLBACK24-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; FALLBACK24-NEXT:    vmovups (%ecx), %xmm0
+; FALLBACK24-NEXT:    movl 16(%ecx), %esi
+; FALLBACK24-NEXT:    movl 20(%ecx), %edi
+; FALLBACK24-NEXT:    movl 24(%ecx), %ebx
+; FALLBACK24-NEXT:    movl 28(%ecx), %edx
+; FALLBACK24-NEXT:    movzbl (%eax), %eax
+; FALLBACK24-NEXT:    movl %eax, %ecx
+; FALLBACK24-NEXT:    shlb $3, %cl
+; FALLBACK24-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; FALLBACK24-NEXT:    movl %ebx, {{[0-9]+}}(%esp)
+; FALLBACK24-NEXT:    movl %edi, {{[0-9]+}}(%esp)
+; FALLBACK24-NEXT:    movl %esi, {{[0-9]+}}(%esp)
+; FALLBACK24-NEXT:    vmovaps %xmm0, {{[0-9]+}}(%esp)
+; FALLBACK24-NEXT:    sarl $31, %edx
+; FALLBACK24-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; FALLBACK24-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; FALLBACK24-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; FALLBACK24-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; FALLBACK24-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; FALLBACK24-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; FALLBACK24-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; FALLBACK24-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; FALLBACK24-NEXT:    andb $28, %al
+; FALLBACK24-NEXT:    movzbl %al, %edi
+; FALLBACK24-NEXT:    movl 32(%esp,%edi), %eax
+; FALLBACK24-NEXT:    movl 36(%esp,%edi), %esi
+; FALLBACK24-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK24-NEXT:    shrl %cl, %eax
+; FALLBACK24-NEXT:    movl %ecx, %edx
+; FALLBACK24-NEXT:    movb %cl, %dh
+; FALLBACK24-NEXT:    notb %dl
+; FALLBACK24-NEXT:    addl %esi, %esi
+; FALLBACK24-NEXT:    movl %edx, %ecx
+; FALLBACK24-NEXT:    shll %cl, %esi
+; FALLBACK24-NEXT:    orl %eax, %esi
+; FALLBACK24-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK24-NEXT:    movl 44(%esp,%edi), %ebx
+; FALLBACK24-NEXT:    movl %ebx, %eax
+; FALLBACK24-NEXT:    movb %dh, %cl
+; FALLBACK24-NEXT:    shrl %cl, %eax
+; FALLBACK24-NEXT:    movl 48(%esp,%edi), %esi
+; FALLBACK24-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK24-NEXT:    addl %esi, %esi
+; FALLBACK24-NEXT:    movl %edx, %ecx
+; FALLBACK24-NEXT:    shll %cl, %esi
+; FALLBACK24-NEXT:    orl %eax, %esi
+; FALLBACK24-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK24-NEXT:    movl 40(%esp,%edi), %esi
+; FALLBACK24-NEXT:    movl %esi, %eax
+; FALLBACK24-NEXT:    movb %dh, %cl
+; FALLBACK24-NEXT:    shrl %cl, %eax
+; FALLBACK24-NEXT:    addl %ebx, %ebx
+; FALLBACK24-NEXT:    movl %edx, %ecx
+; FALLBACK24-NEXT:    shll %cl, %ebx
+; FALLBACK24-NEXT:    orl %eax, %ebx
+; FALLBACK24-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK24-NEXT:    movl 52(%esp,%edi), %ebp
+; FALLBACK24-NEXT:    movl %ebp, %eax
+; FALLBACK24-NEXT:    movb %dh, %cl
+; FALLBACK24-NEXT:    shrl %cl, %eax
+; FALLBACK24-NEXT:    movl 56(%esp,%edi), %ecx
+; FALLBACK24-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK24-NEXT:    leal (%ecx,%ecx), %ebx
+; FALLBACK24-NEXT:    movl %edx, %ecx
+; FALLBACK24-NEXT:    shll %cl, %ebx
+; FALLBACK24-NEXT:    orl %eax, %ebx
+; FALLBACK24-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK24-NEXT:    movb %dh, %cl
+; FALLBACK24-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK24-NEXT:    shrl %cl, %eax
+; FALLBACK24-NEXT:    addl %ebp, %ebp
+; FALLBACK24-NEXT:    movl %edx, %ecx
+; FALLBACK24-NEXT:    shll %cl, %ebp
+; FALLBACK24-NEXT:    orl %eax, %ebp
+; FALLBACK24-NEXT:    movb %dh, %cl
+; FALLBACK24-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; FALLBACK24-NEXT:    shrl %cl, %ebx
+; FALLBACK24-NEXT:    movl 60(%esp,%edi), %eax
+; FALLBACK24-NEXT:    leal (%eax,%eax), %edi
+; FALLBACK24-NEXT:    movl %edx, %ecx
+; FALLBACK24-NEXT:    shll %cl, %edi
+; FALLBACK24-NEXT:    orl %ebx, %edi
+; FALLBACK24-NEXT:    movb %dh, %cl
+; FALLBACK24-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; FALLBACK24-NEXT:    shrl %cl, %ebx
+; FALLBACK24-NEXT:    addl %esi, %esi
+; FALLBACK24-NEXT:    movl %edx, %ecx
+; FALLBACK24-NEXT:    shll %cl, %esi
+; FALLBACK24-NEXT:    orl %ebx, %esi
+; FALLBACK24-NEXT:    movb %dh, %cl
+; FALLBACK24-NEXT:    sarl %cl, %eax
+; FALLBACK24-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; FALLBACK24-NEXT:    movl %eax, 28(%ecx)
+; FALLBACK24-NEXT:    movl %esi, 4(%ecx)
+; FALLBACK24-NEXT:    movl %edi, 24(%ecx)
+; FALLBACK24-NEXT:    movl %ebp, 16(%ecx)
+; FALLBACK24-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK24-NEXT:    movl %eax, 20(%ecx)
+; FALLBACK24-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK24-NEXT:    movl %eax, 8(%ecx)
+; FALLBACK24-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK24-NEXT:    movl %eax, 12(%ecx)
+; FALLBACK24-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK24-NEXT:    movl %eax, (%ecx)
+; FALLBACK24-NEXT:    addl $108, %esp
+; FALLBACK24-NEXT:    popl %esi
+; FALLBACK24-NEXT:    popl %edi
+; FALLBACK24-NEXT:    popl %ebx
+; FALLBACK24-NEXT:    popl %ebp
+; FALLBACK24-NEXT:    retl
+;
+; FALLBACK25-LABEL: ashr_32bytes:
+; FALLBACK25:       # %bb.0:
+; FALLBACK25-NEXT:    pushl %ebp
+; FALLBACK25-NEXT:    pushl %ebx
+; FALLBACK25-NEXT:    pushl %edi
+; FALLBACK25-NEXT:    pushl %esi
+; FALLBACK25-NEXT:    subl $108, %esp
+; FALLBACK25-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; FALLBACK25-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; FALLBACK25-NEXT:    vmovups (%ecx), %xmm0
+; FALLBACK25-NEXT:    movl 16(%ecx), %esi
+; FALLBACK25-NEXT:    movl 20(%ecx), %edi
+; FALLBACK25-NEXT:    movl 24(%ecx), %ebx
+; FALLBACK25-NEXT:    movl 28(%ecx), %edx
+; FALLBACK25-NEXT:    movzbl (%eax), %eax
+; FALLBACK25-NEXT:    movl %eax, %ecx
+; FALLBACK25-NEXT:    shlb $3, %cl
+; FALLBACK25-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; FALLBACK25-NEXT:    movl %ebx, {{[0-9]+}}(%esp)
+; FALLBACK25-NEXT:    movl %edi, {{[0-9]+}}(%esp)
+; FALLBACK25-NEXT:    movl %esi, {{[0-9]+}}(%esp)
+; FALLBACK25-NEXT:    vmovaps %xmm0, {{[0-9]+}}(%esp)
+; FALLBACK25-NEXT:    sarl $31, %edx
+; FALLBACK25-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; FALLBACK25-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; FALLBACK25-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; FALLBACK25-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; FALLBACK25-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; FALLBACK25-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; FALLBACK25-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; FALLBACK25-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; FALLBACK25-NEXT:    andb $28, %al
+; FALLBACK25-NEXT:    movzbl %al, %ebp
+; FALLBACK25-NEXT:    movl 48(%esp,%ebp), %esi
+; FALLBACK25-NEXT:    movl 44(%esp,%ebp), %eax
+; FALLBACK25-NEXT:    movl %eax, %edx
+; FALLBACK25-NEXT:    shrdl %cl, %esi, %edx
+; FALLBACK25-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK25-NEXT:    movl 40(%esp,%ebp), %edx
+; FALLBACK25-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK25-NEXT:    shrdl %cl, %eax, %edx
+; FALLBACK25-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK25-NEXT:    movl 56(%esp,%ebp), %ebx
+; FALLBACK25-NEXT:    movl 52(%esp,%ebp), %eax
+; FALLBACK25-NEXT:    movl %eax, %edx
+; FALLBACK25-NEXT:    shrdl %cl, %ebx, %edx
+; FALLBACK25-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK25-NEXT:    shrdl %cl, %eax, %esi
+; FALLBACK25-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK25-NEXT:    movl 60(%esp,%ebp), %eax
+; FALLBACK25-NEXT:    shrdl %cl, %eax, %ebx
+; FALLBACK25-NEXT:    movl 32(%esp,%ebp), %edx
+; FALLBACK25-NEXT:    movl 36(%esp,%ebp), %edi
+; FALLBACK25-NEXT:    movl %edi, %esi
+; FALLBACK25-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
+; FALLBACK25-NEXT:    shrdl %cl, %ebp, %esi
+; FALLBACK25-NEXT:    movl {{[0-9]+}}(%esp), %ebp
+; FALLBACK25-NEXT:    movl %esi, 4(%ebp)
+; FALLBACK25-NEXT:    movl %ebx, 24(%ebp)
+; FALLBACK25-NEXT:    shrdl %cl, %edi, %edx
+; FALLBACK25-NEXT:    sarl %cl, %eax
+; FALLBACK25-NEXT:    movl %eax, 28(%ebp)
+; FALLBACK25-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK25-NEXT:    movl %eax, 16(%ebp)
+; FALLBACK25-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK25-NEXT:    movl %eax, 20(%ebp)
+; FALLBACK25-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK25-NEXT:    movl %eax, 8(%ebp)
+; FALLBACK25-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK25-NEXT:    movl %eax, 12(%ebp)
+; FALLBACK25-NEXT:    movl %edx, (%ebp)
+; FALLBACK25-NEXT:    addl $108, %esp
+; FALLBACK25-NEXT:    popl %esi
+; FALLBACK25-NEXT:    popl %edi
+; FALLBACK25-NEXT:    popl %ebx
+; FALLBACK25-NEXT:    popl %ebp
+; FALLBACK25-NEXT:    retl
+;
+; FALLBACK26-LABEL: ashr_32bytes:
+; FALLBACK26:       # %bb.0:
+; FALLBACK26-NEXT:    pushl %ebp
+; FALLBACK26-NEXT:    pushl %ebx
+; FALLBACK26-NEXT:    pushl %edi
+; FALLBACK26-NEXT:    pushl %esi
+; FALLBACK26-NEXT:    subl $108, %esp
+; FALLBACK26-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; FALLBACK26-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; FALLBACK26-NEXT:    vmovups (%ecx), %xmm0
+; FALLBACK26-NEXT:    movl 16(%ecx), %esi
+; FALLBACK26-NEXT:    movl 20(%ecx), %edi
+; FALLBACK26-NEXT:    movl 24(%ecx), %ebx
+; FALLBACK26-NEXT:    movl 28(%ecx), %edx
+; FALLBACK26-NEXT:    movzbl (%eax), %ecx
+; FALLBACK26-NEXT:    movl %ecx, %eax
+; FALLBACK26-NEXT:    shlb $3, %al
+; FALLBACK26-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; FALLBACK26-NEXT:    movl %ebx, {{[0-9]+}}(%esp)
+; FALLBACK26-NEXT:    movl %edi, {{[0-9]+}}(%esp)
+; FALLBACK26-NEXT:    movl %esi, {{[0-9]+}}(%esp)
+; FALLBACK26-NEXT:    vmovaps %xmm0, {{[0-9]+}}(%esp)
+; FALLBACK26-NEXT:    sarl $31, %edx
+; FALLBACK26-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; FALLBACK26-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; FALLBACK26-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; FALLBACK26-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; FALLBACK26-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; FALLBACK26-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; FALLBACK26-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; FALLBACK26-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; FALLBACK26-NEXT:    andb $28, %cl
+; FALLBACK26-NEXT:    movzbl %cl, %edi
+; FALLBACK26-NEXT:    shrxl %eax, 32(%esp,%edi), %ecx
+; FALLBACK26-NEXT:    movl %eax, %edx
+; FALLBACK26-NEXT:    notb %dl
+; FALLBACK26-NEXT:    movl 36(%esp,%edi), %esi
+; FALLBACK26-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK26-NEXT:    addl %esi, %esi
+; FALLBACK26-NEXT:    shlxl %edx, %esi, %esi
+; FALLBACK26-NEXT:    orl %ecx, %esi
+; FALLBACK26-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK26-NEXT:    movl 48(%esp,%edi), %ecx
+; FALLBACK26-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK26-NEXT:    addl %ecx, %ecx
+; FALLBACK26-NEXT:    shlxl %edx, %ecx, %esi
+; FALLBACK26-NEXT:    movl 44(%esp,%edi), %ecx
+; FALLBACK26-NEXT:    shrxl %eax, %ecx, %ebx
+; FALLBACK26-NEXT:    orl %ebx, %esi
+; FALLBACK26-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK26-NEXT:    addl %ecx, %ecx
+; FALLBACK26-NEXT:    shlxl %edx, %ecx, %esi
+; FALLBACK26-NEXT:    movl 40(%esp,%edi), %ecx
+; FALLBACK26-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK26-NEXT:    shrxl %eax, %ecx, %ebx
+; FALLBACK26-NEXT:    movl %eax, %ecx
+; FALLBACK26-NEXT:    orl %ebx, %esi
+; FALLBACK26-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK26-NEXT:    movl 56(%esp,%edi), %esi
+; FALLBACK26-NEXT:    leal (%esi,%esi), %ebx
+; FALLBACK26-NEXT:    shlxl %edx, %ebx, %eax
+; FALLBACK26-NEXT:    movl 52(%esp,%edi), %ebx
+; FALLBACK26-NEXT:    shrxl %ecx, %ebx, %ebp
+; FALLBACK26-NEXT:    orl %ebp, %eax
+; FALLBACK26-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK26-NEXT:    movl %ecx, %eax
+; FALLBACK26-NEXT:    shrxl %ecx, {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
+; FALLBACK26-NEXT:    addl %ebx, %ebx
+; FALLBACK26-NEXT:    shlxl %edx, %ebx, %ebx
+; FALLBACK26-NEXT:    orl %ebp, %ebx
+; FALLBACK26-NEXT:    shrxl %ecx, %esi, %ecx
+; FALLBACK26-NEXT:    shrxl %eax, {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
+; FALLBACK26-NEXT:    movl 60(%esp,%edi), %edi
+; FALLBACK26-NEXT:    sarxl %eax, %edi, %eax
+; FALLBACK26-NEXT:    addl %edi, %edi
+; FALLBACK26-NEXT:    shlxl %edx, %edi, %edi
+; FALLBACK26-NEXT:    orl %ecx, %edi
+; FALLBACK26-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK26-NEXT:    addl %ecx, %ecx
+; FALLBACK26-NEXT:    shlxl %edx, %ecx, %ecx
+; FALLBACK26-NEXT:    orl %esi, %ecx
+; FALLBACK26-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; FALLBACK26-NEXT:    movl %eax, 28(%edx)
+; FALLBACK26-NEXT:    movl %ecx, 4(%edx)
+; FALLBACK26-NEXT:    movl %edi, 24(%edx)
+; FALLBACK26-NEXT:    movl %ebx, 16(%edx)
+; FALLBACK26-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK26-NEXT:    movl %eax, 20(%edx)
+; FALLBACK26-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK26-NEXT:    movl %eax, 8(%edx)
+; FALLBACK26-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK26-NEXT:    movl %eax, 12(%edx)
+; FALLBACK26-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK26-NEXT:    movl %eax, (%edx)
+; FALLBACK26-NEXT:    addl $108, %esp
+; FALLBACK26-NEXT:    popl %esi
+; FALLBACK26-NEXT:    popl %edi
+; FALLBACK26-NEXT:    popl %ebx
+; FALLBACK26-NEXT:    popl %ebp
+; FALLBACK26-NEXT:    retl
+;
+; FALLBACK27-LABEL: ashr_32bytes:
+; FALLBACK27:       # %bb.0:
+; FALLBACK27-NEXT:    pushl %ebp
+; FALLBACK27-NEXT:    pushl %ebx
+; FALLBACK27-NEXT:    pushl %edi
+; FALLBACK27-NEXT:    pushl %esi
+; FALLBACK27-NEXT:    subl $108, %esp
+; FALLBACK27-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; FALLBACK27-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; FALLBACK27-NEXT:    vmovups (%ecx), %xmm0
+; FALLBACK27-NEXT:    movl 16(%ecx), %esi
+; FALLBACK27-NEXT:    movl 20(%ecx), %edi
+; FALLBACK27-NEXT:    movl 24(%ecx), %ebx
+; FALLBACK27-NEXT:    movl 28(%ecx), %edx
+; FALLBACK27-NEXT:    movzbl (%eax), %eax
+; FALLBACK27-NEXT:    movl %eax, %ecx
+; FALLBACK27-NEXT:    shlb $3, %cl
+; FALLBACK27-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; FALLBACK27-NEXT:    movl %ebx, {{[0-9]+}}(%esp)
+; FALLBACK27-NEXT:    movl %edi, {{[0-9]+}}(%esp)
+; FALLBACK27-NEXT:    movl %esi, {{[0-9]+}}(%esp)
+; FALLBACK27-NEXT:    vmovaps %xmm0, {{[0-9]+}}(%esp)
+; FALLBACK27-NEXT:    sarl $31, %edx
+; FALLBACK27-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; FALLBACK27-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; FALLBACK27-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; FALLBACK27-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; FALLBACK27-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; FALLBACK27-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; FALLBACK27-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; FALLBACK27-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; FALLBACK27-NEXT:    andb $28, %al
+; FALLBACK27-NEXT:    movzbl %al, %ebx
+; FALLBACK27-NEXT:    movl 48(%esp,%ebx), %esi
+; FALLBACK27-NEXT:    movl 44(%esp,%ebx), %eax
+; FALLBACK27-NEXT:    movl %eax, %edx
+; FALLBACK27-NEXT:    shrdl %cl, %esi, %edx
+; FALLBACK27-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK27-NEXT:    movl 40(%esp,%ebx), %edx
+; FALLBACK27-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK27-NEXT:    shrdl %cl, %eax, %edx
+; FALLBACK27-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK27-NEXT:    movl 56(%esp,%ebx), %ebp
+; FALLBACK27-NEXT:    movl 52(%esp,%ebx), %eax
+; FALLBACK27-NEXT:    movl %eax, %edi
+; FALLBACK27-NEXT:    shrdl %cl, %ebp, %edi
+; FALLBACK27-NEXT:    shrdl %cl, %eax, %esi
+; FALLBACK27-NEXT:    movl 60(%esp,%ebx), %eax
+; FALLBACK27-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK27-NEXT:    shrdl %cl, %eax, %ebp
+; FALLBACK27-NEXT:    movl 32(%esp,%ebx), %edx
+; FALLBACK27-NEXT:    movl 36(%esp,%ebx), %ebx
+; FALLBACK27-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK27-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK27-NEXT:    shrdl %cl, %eax, %ebx
+; FALLBACK27-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; FALLBACK27-NEXT:    movl %ebx, 4(%eax)
+; FALLBACK27-NEXT:    movl %ebp, 24(%eax)
+; FALLBACK27-NEXT:    sarxl %ecx, {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
+; FALLBACK27-NEXT:    movl %ebx, 28(%eax)
+; FALLBACK27-NEXT:    movl %esi, 16(%eax)
+; FALLBACK27-NEXT:    movl %edi, 20(%eax)
+; FALLBACK27-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; FALLBACK27-NEXT:    movl %esi, 8(%eax)
+; FALLBACK27-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; FALLBACK27-NEXT:    movl %esi, 12(%eax)
+; FALLBACK27-NEXT:    # kill: def $cl killed $cl killed $ecx
+; FALLBACK27-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; FALLBACK27-NEXT:    shrdl %cl, %esi, %edx
+; FALLBACK27-NEXT:    movl %edx, (%eax)
+; FALLBACK27-NEXT:    addl $108, %esp
+; FALLBACK27-NEXT:    popl %esi
+; FALLBACK27-NEXT:    popl %edi
+; FALLBACK27-NEXT:    popl %ebx
+; FALLBACK27-NEXT:    popl %ebp
+; FALLBACK27-NEXT:    retl
+;
+; FALLBACK28-LABEL: ashr_32bytes:
+; FALLBACK28:       # %bb.0:
+; FALLBACK28-NEXT:    pushl %ebp
+; FALLBACK28-NEXT:    pushl %ebx
+; FALLBACK28-NEXT:    pushl %edi
+; FALLBACK28-NEXT:    pushl %esi
+; FALLBACK28-NEXT:    subl $108, %esp
+; FALLBACK28-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; FALLBACK28-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; FALLBACK28-NEXT:    vmovups (%ecx), %xmm0
+; FALLBACK28-NEXT:    movl 16(%ecx), %esi
+; FALLBACK28-NEXT:    movl 20(%ecx), %edi
+; FALLBACK28-NEXT:    movl 24(%ecx), %ebx
+; FALLBACK28-NEXT:    movl 28(%ecx), %edx
+; FALLBACK28-NEXT:    movzbl (%eax), %eax
+; FALLBACK28-NEXT:    movl %eax, %ecx
+; FALLBACK28-NEXT:    shlb $3, %cl
+; FALLBACK28-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; FALLBACK28-NEXT:    movl %ebx, {{[0-9]+}}(%esp)
+; FALLBACK28-NEXT:    movl %edi, {{[0-9]+}}(%esp)
+; FALLBACK28-NEXT:    movl %esi, {{[0-9]+}}(%esp)
+; FALLBACK28-NEXT:    vmovaps %xmm0, {{[0-9]+}}(%esp)
+; FALLBACK28-NEXT:    sarl $31, %edx
+; FALLBACK28-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; FALLBACK28-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; FALLBACK28-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; FALLBACK28-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; FALLBACK28-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; FALLBACK28-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; FALLBACK28-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; FALLBACK28-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; FALLBACK28-NEXT:    andb $28, %al
+; FALLBACK28-NEXT:    movzbl %al, %edi
+; FALLBACK28-NEXT:    movl 32(%esp,%edi), %eax
+; FALLBACK28-NEXT:    movl 36(%esp,%edi), %esi
+; FALLBACK28-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK28-NEXT:    shrl %cl, %eax
+; FALLBACK28-NEXT:    movl %ecx, %edx
+; FALLBACK28-NEXT:    movb %cl, %dh
+; FALLBACK28-NEXT:    notb %dl
+; FALLBACK28-NEXT:    addl %esi, %esi
+; FALLBACK28-NEXT:    movl %edx, %ecx
+; FALLBACK28-NEXT:    shll %cl, %esi
+; FALLBACK28-NEXT:    orl %eax, %esi
+; FALLBACK28-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK28-NEXT:    movl 44(%esp,%edi), %ebx
+; FALLBACK28-NEXT:    movl %ebx, %eax
+; FALLBACK28-NEXT:    movb %dh, %cl
+; FALLBACK28-NEXT:    shrl %cl, %eax
+; FALLBACK28-NEXT:    movl 48(%esp,%edi), %esi
+; FALLBACK28-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK28-NEXT:    addl %esi, %esi
+; FALLBACK28-NEXT:    movl %edx, %ecx
+; FALLBACK28-NEXT:    shll %cl, %esi
+; FALLBACK28-NEXT:    orl %eax, %esi
+; FALLBACK28-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK28-NEXT:    movl 40(%esp,%edi), %esi
+; FALLBACK28-NEXT:    movl %esi, %eax
+; FALLBACK28-NEXT:    movb %dh, %cl
+; FALLBACK28-NEXT:    shrl %cl, %eax
+; FALLBACK28-NEXT:    addl %ebx, %ebx
+; FALLBACK28-NEXT:    movl %edx, %ecx
+; FALLBACK28-NEXT:    shll %cl, %ebx
+; FALLBACK28-NEXT:    orl %eax, %ebx
+; FALLBACK28-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK28-NEXT:    movl 52(%esp,%edi), %ebp
+; FALLBACK28-NEXT:    movl %ebp, %eax
+; FALLBACK28-NEXT:    movb %dh, %cl
+; FALLBACK28-NEXT:    shrl %cl, %eax
+; FALLBACK28-NEXT:    movl 56(%esp,%edi), %ecx
+; FALLBACK28-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK28-NEXT:    leal (%ecx,%ecx), %ebx
+; FALLBACK28-NEXT:    movl %edx, %ecx
+; FALLBACK28-NEXT:    shll %cl, %ebx
+; FALLBACK28-NEXT:    orl %eax, %ebx
+; FALLBACK28-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK28-NEXT:    movb %dh, %cl
+; FALLBACK28-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK28-NEXT:    shrl %cl, %eax
+; FALLBACK28-NEXT:    addl %ebp, %ebp
+; FALLBACK28-NEXT:    movl %edx, %ecx
+; FALLBACK28-NEXT:    shll %cl, %ebp
+; FALLBACK28-NEXT:    orl %eax, %ebp
+; FALLBACK28-NEXT:    movb %dh, %cl
+; FALLBACK28-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; FALLBACK28-NEXT:    shrl %cl, %ebx
+; FALLBACK28-NEXT:    movl 60(%esp,%edi), %eax
+; FALLBACK28-NEXT:    leal (%eax,%eax), %edi
+; FALLBACK28-NEXT:    movl %edx, %ecx
+; FALLBACK28-NEXT:    shll %cl, %edi
+; FALLBACK28-NEXT:    orl %ebx, %edi
+; FALLBACK28-NEXT:    movb %dh, %cl
+; FALLBACK28-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; FALLBACK28-NEXT:    shrl %cl, %ebx
+; FALLBACK28-NEXT:    addl %esi, %esi
+; FALLBACK28-NEXT:    movl %edx, %ecx
+; FALLBACK28-NEXT:    shll %cl, %esi
+; FALLBACK28-NEXT:    orl %ebx, %esi
+; FALLBACK28-NEXT:    movb %dh, %cl
+; FALLBACK28-NEXT:    sarl %cl, %eax
+; FALLBACK28-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; FALLBACK28-NEXT:    movl %eax, 28(%ecx)
+; FALLBACK28-NEXT:    movl %esi, 4(%ecx)
+; FALLBACK28-NEXT:    movl %edi, 24(%ecx)
+; FALLBACK28-NEXT:    movl %ebp, 16(%ecx)
+; FALLBACK28-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK28-NEXT:    movl %eax, 20(%ecx)
+; FALLBACK28-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK28-NEXT:    movl %eax, 8(%ecx)
+; FALLBACK28-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK28-NEXT:    movl %eax, 12(%ecx)
+; FALLBACK28-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK28-NEXT:    movl %eax, (%ecx)
+; FALLBACK28-NEXT:    addl $108, %esp
+; FALLBACK28-NEXT:    popl %esi
+; FALLBACK28-NEXT:    popl %edi
+; FALLBACK28-NEXT:    popl %ebx
+; FALLBACK28-NEXT:    popl %ebp
+; FALLBACK28-NEXT:    retl
+;
+; FALLBACK29-LABEL: ashr_32bytes:
+; FALLBACK29:       # %bb.0:
+; FALLBACK29-NEXT:    pushl %ebp
+; FALLBACK29-NEXT:    pushl %ebx
+; FALLBACK29-NEXT:    pushl %edi
+; FALLBACK29-NEXT:    pushl %esi
+; FALLBACK29-NEXT:    subl $108, %esp
+; FALLBACK29-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; FALLBACK29-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; FALLBACK29-NEXT:    vmovups (%ecx), %xmm0
+; FALLBACK29-NEXT:    movl 16(%ecx), %esi
+; FALLBACK29-NEXT:    movl 20(%ecx), %edi
+; FALLBACK29-NEXT:    movl 24(%ecx), %ebx
+; FALLBACK29-NEXT:    movl 28(%ecx), %edx
+; FALLBACK29-NEXT:    movzbl (%eax), %eax
+; FALLBACK29-NEXT:    movl %eax, %ecx
+; FALLBACK29-NEXT:    shlb $3, %cl
+; FALLBACK29-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; FALLBACK29-NEXT:    movl %ebx, {{[0-9]+}}(%esp)
+; FALLBACK29-NEXT:    movl %edi, {{[0-9]+}}(%esp)
+; FALLBACK29-NEXT:    movl %esi, {{[0-9]+}}(%esp)
+; FALLBACK29-NEXT:    vmovaps %xmm0, {{[0-9]+}}(%esp)
+; FALLBACK29-NEXT:    sarl $31, %edx
+; FALLBACK29-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; FALLBACK29-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; FALLBACK29-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; FALLBACK29-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; FALLBACK29-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; FALLBACK29-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; FALLBACK29-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; FALLBACK29-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; FALLBACK29-NEXT:    andb $28, %al
+; FALLBACK29-NEXT:    movzbl %al, %ebp
+; FALLBACK29-NEXT:    movl 48(%esp,%ebp), %esi
+; FALLBACK29-NEXT:    movl 44(%esp,%ebp), %eax
+; FALLBACK29-NEXT:    movl %eax, %edx
+; FALLBACK29-NEXT:    shrdl %cl, %esi, %edx
+; FALLBACK29-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK29-NEXT:    movl 40(%esp,%ebp), %edx
+; FALLBACK29-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK29-NEXT:    shrdl %cl, %eax, %edx
+; FALLBACK29-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK29-NEXT:    movl 56(%esp,%ebp), %ebx
+; FALLBACK29-NEXT:    movl 52(%esp,%ebp), %eax
+; FALLBACK29-NEXT:    movl %eax, %edx
+; FALLBACK29-NEXT:    shrdl %cl, %ebx, %edx
+; FALLBACK29-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK29-NEXT:    shrdl %cl, %eax, %esi
+; FALLBACK29-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK29-NEXT:    movl 60(%esp,%ebp), %eax
+; FALLBACK29-NEXT:    shrdl %cl, %eax, %ebx
+; FALLBACK29-NEXT:    movl 32(%esp,%ebp), %edx
+; FALLBACK29-NEXT:    movl 36(%esp,%ebp), %edi
+; FALLBACK29-NEXT:    movl %edi, %esi
+; FALLBACK29-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
+; FALLBACK29-NEXT:    shrdl %cl, %ebp, %esi
+; FALLBACK29-NEXT:    movl {{[0-9]+}}(%esp), %ebp
+; FALLBACK29-NEXT:    movl %esi, 4(%ebp)
+; FALLBACK29-NEXT:    movl %ebx, 24(%ebp)
+; FALLBACK29-NEXT:    shrdl %cl, %edi, %edx
+; FALLBACK29-NEXT:    sarl %cl, %eax
+; FALLBACK29-NEXT:    movl %eax, 28(%ebp)
+; FALLBACK29-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK29-NEXT:    movl %eax, 16(%ebp)
+; FALLBACK29-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK29-NEXT:    movl %eax, 20(%ebp)
+; FALLBACK29-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK29-NEXT:    movl %eax, 8(%ebp)
+; FALLBACK29-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK29-NEXT:    movl %eax, 12(%ebp)
+; FALLBACK29-NEXT:    movl %edx, (%ebp)
+; FALLBACK29-NEXT:    addl $108, %esp
+; FALLBACK29-NEXT:    popl %esi
+; FALLBACK29-NEXT:    popl %edi
+; FALLBACK29-NEXT:    popl %ebx
+; FALLBACK29-NEXT:    popl %ebp
+; FALLBACK29-NEXT:    retl
+;
+; FALLBACK30-LABEL: ashr_32bytes:
+; FALLBACK30:       # %bb.0:
+; FALLBACK30-NEXT:    pushl %ebp
+; FALLBACK30-NEXT:    pushl %ebx
+; FALLBACK30-NEXT:    pushl %edi
+; FALLBACK30-NEXT:    pushl %esi
+; FALLBACK30-NEXT:    subl $108, %esp
+; FALLBACK30-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; FALLBACK30-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; FALLBACK30-NEXT:    vmovups (%ecx), %xmm0
+; FALLBACK30-NEXT:    movl 16(%ecx), %esi
+; FALLBACK30-NEXT:    movl 20(%ecx), %edi
+; FALLBACK30-NEXT:    movl 24(%ecx), %ebx
+; FALLBACK30-NEXT:    movl 28(%ecx), %edx
+; FALLBACK30-NEXT:    movzbl (%eax), %ecx
+; FALLBACK30-NEXT:    movl %ecx, %eax
+; FALLBACK30-NEXT:    shlb $3, %al
+; FALLBACK30-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; FALLBACK30-NEXT:    movl %ebx, {{[0-9]+}}(%esp)
+; FALLBACK30-NEXT:    movl %edi, {{[0-9]+}}(%esp)
+; FALLBACK30-NEXT:    movl %esi, {{[0-9]+}}(%esp)
+; FALLBACK30-NEXT:    vmovaps %xmm0, {{[0-9]+}}(%esp)
+; FALLBACK30-NEXT:    sarl $31, %edx
+; FALLBACK30-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; FALLBACK30-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; FALLBACK30-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; FALLBACK30-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; FALLBACK30-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; FALLBACK30-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; FALLBACK30-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; FALLBACK30-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; FALLBACK30-NEXT:    andb $28, %cl
+; FALLBACK30-NEXT:    movzbl %cl, %edi
+; FALLBACK30-NEXT:    shrxl %eax, 32(%esp,%edi), %ecx
+; FALLBACK30-NEXT:    movl %eax, %edx
+; FALLBACK30-NEXT:    notb %dl
+; FALLBACK30-NEXT:    movl 36(%esp,%edi), %esi
+; FALLBACK30-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK30-NEXT:    addl %esi, %esi
+; FALLBACK30-NEXT:    shlxl %edx, %esi, %esi
+; FALLBACK30-NEXT:    orl %ecx, %esi
+; FALLBACK30-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK30-NEXT:    movl 48(%esp,%edi), %ecx
+; FALLBACK30-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK30-NEXT:    addl %ecx, %ecx
+; FALLBACK30-NEXT:    shlxl %edx, %ecx, %esi
+; FALLBACK30-NEXT:    movl 44(%esp,%edi), %ecx
+; FALLBACK30-NEXT:    shrxl %eax, %ecx, %ebx
+; FALLBACK30-NEXT:    orl %ebx, %esi
+; FALLBACK30-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK30-NEXT:    addl %ecx, %ecx
+; FALLBACK30-NEXT:    shlxl %edx, %ecx, %esi
+; FALLBACK30-NEXT:    movl 40(%esp,%edi), %ecx
+; FALLBACK30-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK30-NEXT:    shrxl %eax, %ecx, %ebx
+; FALLBACK30-NEXT:    movl %eax, %ecx
+; FALLBACK30-NEXT:    orl %ebx, %esi
+; FALLBACK30-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK30-NEXT:    movl 56(%esp,%edi), %esi
+; FALLBACK30-NEXT:    leal (%esi,%esi), %ebx
+; FALLBACK30-NEXT:    shlxl %edx, %ebx, %eax
+; FALLBACK30-NEXT:    movl 52(%esp,%edi), %ebx
+; FALLBACK30-NEXT:    shrxl %ecx, %ebx, %ebp
+; FALLBACK30-NEXT:    orl %ebp, %eax
+; FALLBACK30-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK30-NEXT:    movl %ecx, %eax
+; FALLBACK30-NEXT:    shrxl %ecx, {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
+; FALLBACK30-NEXT:    addl %ebx, %ebx
+; FALLBACK30-NEXT:    shlxl %edx, %ebx, %ebx
+; FALLBACK30-NEXT:    orl %ebp, %ebx
+; FALLBACK30-NEXT:    shrxl %ecx, %esi, %ecx
+; FALLBACK30-NEXT:    shrxl %eax, {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
+; FALLBACK30-NEXT:    movl 60(%esp,%edi), %edi
+; FALLBACK30-NEXT:    sarxl %eax, %edi, %eax
+; FALLBACK30-NEXT:    addl %edi, %edi
+; FALLBACK30-NEXT:    shlxl %edx, %edi, %edi
+; FALLBACK30-NEXT:    orl %ecx, %edi
+; FALLBACK30-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK30-NEXT:    addl %ecx, %ecx
+; FALLBACK30-NEXT:    shlxl %edx, %ecx, %ecx
+; FALLBACK30-NEXT:    orl %esi, %ecx
+; FALLBACK30-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; FALLBACK30-NEXT:    movl %eax, 28(%edx)
+; FALLBACK30-NEXT:    movl %ecx, 4(%edx)
+; FALLBACK30-NEXT:    movl %edi, 24(%edx)
+; FALLBACK30-NEXT:    movl %ebx, 16(%edx)
+; FALLBACK30-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK30-NEXT:    movl %eax, 20(%edx)
+; FALLBACK30-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK30-NEXT:    movl %eax, 8(%edx)
+; FALLBACK30-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK30-NEXT:    movl %eax, 12(%edx)
+; FALLBACK30-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK30-NEXT:    movl %eax, (%edx)
+; FALLBACK30-NEXT:    addl $108, %esp
+; FALLBACK30-NEXT:    popl %esi
+; FALLBACK30-NEXT:    popl %edi
+; FALLBACK30-NEXT:    popl %ebx
+; FALLBACK30-NEXT:    popl %ebp
+; FALLBACK30-NEXT:    retl
+;
+; FALLBACK31-LABEL: ashr_32bytes:
+; FALLBACK31:       # %bb.0:
+; FALLBACK31-NEXT:    pushl %ebp
+; FALLBACK31-NEXT:    pushl %ebx
+; FALLBACK31-NEXT:    pushl %edi
+; FALLBACK31-NEXT:    pushl %esi
+; FALLBACK31-NEXT:    subl $108, %esp
+; FALLBACK31-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; FALLBACK31-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; FALLBACK31-NEXT:    vmovups (%ecx), %xmm0
+; FALLBACK31-NEXT:    movl 16(%ecx), %esi
+; FALLBACK31-NEXT:    movl 20(%ecx), %edi
+; FALLBACK31-NEXT:    movl 24(%ecx), %ebx
+; FALLBACK31-NEXT:    movl 28(%ecx), %edx
+; FALLBACK31-NEXT:    movzbl (%eax), %eax
+; FALLBACK31-NEXT:    movl %eax, %ecx
+; FALLBACK31-NEXT:    shlb $3, %cl
+; FALLBACK31-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; FALLBACK31-NEXT:    movl %ebx, {{[0-9]+}}(%esp)
+; FALLBACK31-NEXT:    movl %edi, {{[0-9]+}}(%esp)
+; FALLBACK31-NEXT:    movl %esi, {{[0-9]+}}(%esp)
+; FALLBACK31-NEXT:    vmovaps %xmm0, {{[0-9]+}}(%esp)
+; FALLBACK31-NEXT:    sarl $31, %edx
+; FALLBACK31-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; FALLBACK31-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; FALLBACK31-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; FALLBACK31-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; FALLBACK31-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; FALLBACK31-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; FALLBACK31-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; FALLBACK31-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; FALLBACK31-NEXT:    andb $28, %al
+; FALLBACK31-NEXT:    movzbl %al, %ebx
+; FALLBACK31-NEXT:    movl 48(%esp,%ebx), %esi
+; FALLBACK31-NEXT:    movl 44(%esp,%ebx), %eax
+; FALLBACK31-NEXT:    movl %eax, %edx
+; FALLBACK31-NEXT:    shrdl %cl, %esi, %edx
+; FALLBACK31-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK31-NEXT:    movl 40(%esp,%ebx), %edx
+; FALLBACK31-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK31-NEXT:    shrdl %cl, %eax, %edx
+; FALLBACK31-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK31-NEXT:    movl 56(%esp,%ebx), %ebp
+; FALLBACK31-NEXT:    movl 52(%esp,%ebx), %eax
+; FALLBACK31-NEXT:    movl %eax, %edi
+; FALLBACK31-NEXT:    shrdl %cl, %ebp, %edi
+; FALLBACK31-NEXT:    shrdl %cl, %eax, %esi
+; FALLBACK31-NEXT:    movl 60(%esp,%ebx), %eax
+; FALLBACK31-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK31-NEXT:    shrdl %cl, %eax, %ebp
+; FALLBACK31-NEXT:    movl 32(%esp,%ebx), %edx
+; FALLBACK31-NEXT:    movl 36(%esp,%ebx), %ebx
+; FALLBACK31-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK31-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK31-NEXT:    shrdl %cl, %eax, %ebx
+; FALLBACK31-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; FALLBACK31-NEXT:    movl %ebx, 4(%eax)
+; FALLBACK31-NEXT:    movl %ebp, 24(%eax)
+; FALLBACK31-NEXT:    sarxl %ecx, {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
+; FALLBACK31-NEXT:    movl %ebx, 28(%eax)
+; FALLBACK31-NEXT:    movl %esi, 16(%eax)
+; FALLBACK31-NEXT:    movl %edi, 20(%eax)
+; FALLBACK31-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; FALLBACK31-NEXT:    movl %esi, 8(%eax)
+; FALLBACK31-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; FALLBACK31-NEXT:    movl %esi, 12(%eax)
+; FALLBACK31-NEXT:    # kill: def $cl killed $cl killed $ecx
+; FALLBACK31-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; FALLBACK31-NEXT:    shrdl %cl, %esi, %edx
+; FALLBACK31-NEXT:    movl %edx, (%eax)
+; FALLBACK31-NEXT:    addl $108, %esp
+; FALLBACK31-NEXT:    popl %esi
+; FALLBACK31-NEXT:    popl %edi
+; FALLBACK31-NEXT:    popl %ebx
+; FALLBACK31-NEXT:    popl %ebp
+; FALLBACK31-NEXT:    retl
+  %src = load i256, ptr %src.ptr, align 1
+  %byteOff = load i256, ptr %byteOff.ptr, align 1
+  %bitOff = shl i256 %byteOff, 3
+  %res = ashr i256 %src, %bitOff
+  store i256 %res, ptr %dst, align 1
+  ret void
+}
+
+define void @ashr_32bytes_dwordOff(ptr %src.ptr, ptr %dwordOff.ptr, ptr %dst) nounwind {
+; FALLBACK0-LABEL: ashr_32bytes_dwordOff:
+; FALLBACK0:       # %bb.0:
+; FALLBACK0-NEXT:    pushq %rbx
+; FALLBACK0-NEXT:    movq (%rdi), %rcx
+; FALLBACK0-NEXT:    movq 8(%rdi), %r8
+; FALLBACK0-NEXT:    movq 16(%rdi), %r9
+; FALLBACK0-NEXT:    movq 24(%rdi), %rdi
+; FALLBACK0-NEXT:    movzbl (%rsi), %esi
+; FALLBACK0-NEXT:    movl %esi, %eax
+; FALLBACK0-NEXT:    shlb $5, %al
+; FALLBACK0-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK0-NEXT:    movq %r9, -{{[0-9]+}}(%rsp)
+; FALLBACK0-NEXT:    movq %r8, -{{[0-9]+}}(%rsp)
+; FALLBACK0-NEXT:    movq %rcx, -{{[0-9]+}}(%rsp)
+; FALLBACK0-NEXT:    sarq $63, %rdi
+; FALLBACK0-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK0-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK0-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK0-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK0-NEXT:    andb $6, %sil
+; FALLBACK0-NEXT:    movzbl %sil, %r9d
+; FALLBACK0-NEXT:    movq -64(%rsp,%r9,4), %r10
+; FALLBACK0-NEXT:    movq -56(%rsp,%r9,4), %rdi
+; FALLBACK0-NEXT:    movq %rdi, %r11
+; FALLBACK0-NEXT:    movl %eax, %ecx
+; FALLBACK0-NEXT:    shrq %cl, %r11
+; FALLBACK0-NEXT:    movl %eax, %esi
+; FALLBACK0-NEXT:    notb %sil
+; FALLBACK0-NEXT:    movq -48(%rsp,%r9,4), %rbx
+; FALLBACK0-NEXT:    leaq (%rbx,%rbx), %r8
+; FALLBACK0-NEXT:    movl %esi, %ecx
+; FALLBACK0-NEXT:    shlq %cl, %r8
+; FALLBACK0-NEXT:    orq %r11, %r8
+; FALLBACK0-NEXT:    movl %eax, %ecx
+; FALLBACK0-NEXT:    shrq %cl, %r10
+; FALLBACK0-NEXT:    addq %rdi, %rdi
+; FALLBACK0-NEXT:    movl %esi, %ecx
+; FALLBACK0-NEXT:    shlq %cl, %rdi
+; FALLBACK0-NEXT:    orq %r10, %rdi
+; FALLBACK0-NEXT:    movl %eax, %ecx
+; FALLBACK0-NEXT:    shrq %cl, %rbx
+; FALLBACK0-NEXT:    movq -40(%rsp,%r9,4), %r9
+; FALLBACK0-NEXT:    leaq (%r9,%r9), %r10
+; FALLBACK0-NEXT:    movl %esi, %ecx
+; FALLBACK0-NEXT:    shlq %cl, %r10
+; FALLBACK0-NEXT:    orq %rbx, %r10
+; FALLBACK0-NEXT:    movl %eax, %ecx
+; FALLBACK0-NEXT:    sarq %cl, %r9
+; FALLBACK0-NEXT:    movq %r9, 24(%rdx)
+; FALLBACK0-NEXT:    movq %r10, 16(%rdx)
+; FALLBACK0-NEXT:    movq %rdi, (%rdx)
+; FALLBACK0-NEXT:    movq %r8, 8(%rdx)
+; FALLBACK0-NEXT:    popq %rbx
+; FALLBACK0-NEXT:    retq
+;
+; FALLBACK1-LABEL: ashr_32bytes_dwordOff:
+; FALLBACK1:       # %bb.0:
+; FALLBACK1-NEXT:    movq (%rdi), %rax
+; FALLBACK1-NEXT:    movq 8(%rdi), %r8
+; FALLBACK1-NEXT:    movq 16(%rdi), %r9
+; FALLBACK1-NEXT:    movq 24(%rdi), %rdi
+; FALLBACK1-NEXT:    movzbl (%rsi), %esi
+; FALLBACK1-NEXT:    movl %esi, %ecx
+; FALLBACK1-NEXT:    shlb $5, %cl
+; FALLBACK1-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK1-NEXT:    movq %r9, -{{[0-9]+}}(%rsp)
+; FALLBACK1-NEXT:    movq %r8, -{{[0-9]+}}(%rsp)
+; FALLBACK1-NEXT:    movq %rax, -{{[0-9]+}}(%rsp)
+; FALLBACK1-NEXT:    sarq $63, %rdi
+; FALLBACK1-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK1-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK1-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK1-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK1-NEXT:    andb $6, %sil
+; FALLBACK1-NEXT:    movzbl %sil, %eax
+; FALLBACK1-NEXT:    movq -56(%rsp,%rax,4), %rsi
+; FALLBACK1-NEXT:    movq -72(%rsp,%rax,4), %rdi
+; FALLBACK1-NEXT:    movq -64(%rsp,%rax,4), %r8
+; FALLBACK1-NEXT:    movq %r8, %r9
+; FALLBACK1-NEXT:    shrdq %cl, %rsi, %r9
+; FALLBACK1-NEXT:    movq -48(%rsp,%rax,4), %rax
+; FALLBACK1-NEXT:    shrdq %cl, %rax, %rsi
+; FALLBACK1-NEXT:    shrdq %cl, %r8, %rdi
+; FALLBACK1-NEXT:    sarq %cl, %rax
+; FALLBACK1-NEXT:    movq %rsi, 16(%rdx)
+; FALLBACK1-NEXT:    movq %rax, 24(%rdx)
+; FALLBACK1-NEXT:    movq %rdi, (%rdx)
+; FALLBACK1-NEXT:    movq %r9, 8(%rdx)
+; FALLBACK1-NEXT:    retq
+;
+; FALLBACK2-LABEL: ashr_32bytes_dwordOff:
+; FALLBACK2:       # %bb.0:
+; FALLBACK2-NEXT:    movq (%rdi), %rcx
+; FALLBACK2-NEXT:    movq 8(%rdi), %r8
+; FALLBACK2-NEXT:    movq 16(%rdi), %r9
+; FALLBACK2-NEXT:    movq 24(%rdi), %rdi
+; FALLBACK2-NEXT:    movzbl (%rsi), %esi
+; FALLBACK2-NEXT:    movl %esi, %eax
+; FALLBACK2-NEXT:    shlb $5, %al
+; FALLBACK2-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK2-NEXT:    movq %r9, -{{[0-9]+}}(%rsp)
+; FALLBACK2-NEXT:    movq %r8, -{{[0-9]+}}(%rsp)
+; FALLBACK2-NEXT:    movq %rcx, -{{[0-9]+}}(%rsp)
+; FALLBACK2-NEXT:    sarq $63, %rdi
+; FALLBACK2-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK2-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK2-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK2-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK2-NEXT:    andb $6, %sil
+; FALLBACK2-NEXT:    movzbl %sil, %ecx
+; FALLBACK2-NEXT:    movq -64(%rsp,%rcx,4), %rsi
+; FALLBACK2-NEXT:    movq -56(%rsp,%rcx,4), %rdi
+; FALLBACK2-NEXT:    shrxq %rax, %rsi, %r8
+; FALLBACK2-NEXT:    shrxq %rax, -72(%rsp,%rcx,4), %r9
+; FALLBACK2-NEXT:    shrxq %rax, %rdi, %r10
+; FALLBACK2-NEXT:    movq -48(%rsp,%rcx,4), %rcx
+; FALLBACK2-NEXT:    sarxq %rax, %rcx, %r11
+; FALLBACK2-NEXT:    # kill: def $al killed $al killed $rax def $rax
+; FALLBACK2-NEXT:    notb %al
+; FALLBACK2-NEXT:    addq %rdi, %rdi
+; FALLBACK2-NEXT:    shlxq %rax, %rdi, %rdi
+; FALLBACK2-NEXT:    orq %r8, %rdi
+; FALLBACK2-NEXT:    addq %rsi, %rsi
+; FALLBACK2-NEXT:    shlxq %rax, %rsi, %rsi
+; FALLBACK2-NEXT:    orq %r9, %rsi
+; FALLBACK2-NEXT:    addq %rcx, %rcx
+; FALLBACK2-NEXT:    shlxq %rax, %rcx, %rax
+; FALLBACK2-NEXT:    orq %r10, %rax
+; FALLBACK2-NEXT:    movq %r11, 24(%rdx)
+; FALLBACK2-NEXT:    movq %rax, 16(%rdx)
+; FALLBACK2-NEXT:    movq %rsi, (%rdx)
+; FALLBACK2-NEXT:    movq %rdi, 8(%rdx)
+; FALLBACK2-NEXT:    retq
+;
+; FALLBACK3-LABEL: ashr_32bytes_dwordOff:
+; FALLBACK3:       # %bb.0:
+; FALLBACK3-NEXT:    movq (%rdi), %rax
+; FALLBACK3-NEXT:    movq 8(%rdi), %r8
+; FALLBACK3-NEXT:    movq 16(%rdi), %r9
+; FALLBACK3-NEXT:    movq 24(%rdi), %rdi
+; FALLBACK3-NEXT:    movzbl (%rsi), %esi
+; FALLBACK3-NEXT:    movl %esi, %ecx
+; FALLBACK3-NEXT:    shlb $5, %cl
+; FALLBACK3-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK3-NEXT:    movq %r9, -{{[0-9]+}}(%rsp)
+; FALLBACK3-NEXT:    movq %r8, -{{[0-9]+}}(%rsp)
+; FALLBACK3-NEXT:    movq %rax, -{{[0-9]+}}(%rsp)
+; FALLBACK3-NEXT:    sarq $63, %rdi
+; FALLBACK3-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK3-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK3-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK3-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK3-NEXT:    andb $6, %sil
+; FALLBACK3-NEXT:    movzbl %sil, %eax
+; FALLBACK3-NEXT:    movq -56(%rsp,%rax,4), %rsi
+; FALLBACK3-NEXT:    movq -72(%rsp,%rax,4), %rdi
+; FALLBACK3-NEXT:    movq -64(%rsp,%rax,4), %r8
+; FALLBACK3-NEXT:    movq %r8, %r9
+; FALLBACK3-NEXT:    shrdq %cl, %rsi, %r9
+; FALLBACK3-NEXT:    movq -48(%rsp,%rax,4), %rax
+; FALLBACK3-NEXT:    shrdq %cl, %rax, %rsi
+; FALLBACK3-NEXT:    shrdq %cl, %r8, %rdi
+; FALLBACK3-NEXT:    sarxq %rcx, %rax, %rax
+; FALLBACK3-NEXT:    movq %rsi, 16(%rdx)
+; FALLBACK3-NEXT:    movq %rax, 24(%rdx)
+; FALLBACK3-NEXT:    movq %rdi, (%rdx)
+; FALLBACK3-NEXT:    movq %r9, 8(%rdx)
+; FALLBACK3-NEXT:    retq
+;
+; FALLBACK4-LABEL: ashr_32bytes_dwordOff:
+; FALLBACK4:       # %bb.0:
+; FALLBACK4-NEXT:    pushq %rbx
+; FALLBACK4-NEXT:    movups (%rdi), %xmm0
+; FALLBACK4-NEXT:    movq 16(%rdi), %rcx
+; FALLBACK4-NEXT:    movq 24(%rdi), %rdi
+; FALLBACK4-NEXT:    movzbl (%rsi), %esi
+; FALLBACK4-NEXT:    movl %esi, %eax
+; FALLBACK4-NEXT:    shlb $5, %al
+; FALLBACK4-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK4-NEXT:    movq %rcx, -{{[0-9]+}}(%rsp)
+; FALLBACK4-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
+; FALLBACK4-NEXT:    sarq $63, %rdi
+; FALLBACK4-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK4-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK4-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK4-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK4-NEXT:    andb $6, %sil
+; FALLBACK4-NEXT:    movzbl %sil, %r9d
+; FALLBACK4-NEXT:    movq -64(%rsp,%r9,4), %r10
+; FALLBACK4-NEXT:    movq -56(%rsp,%r9,4), %r8
+; FALLBACK4-NEXT:    movl %eax, %ecx
+; FALLBACK4-NEXT:    shrq %cl, %r10
+; FALLBACK4-NEXT:    movl %eax, %esi
+; FALLBACK4-NEXT:    notb %sil
+; FALLBACK4-NEXT:    leaq (%r8,%r8), %rdi
+; FALLBACK4-NEXT:    movl %esi, %ecx
+; FALLBACK4-NEXT:    shlq %cl, %rdi
+; FALLBACK4-NEXT:    orq %r10, %rdi
+; FALLBACK4-NEXT:    movq -48(%rsp,%r9,4), %r10
+; FALLBACK4-NEXT:    movq %r10, %r11
+; FALLBACK4-NEXT:    movl %eax, %ecx
+; FALLBACK4-NEXT:    shrq %cl, %r11
+; FALLBACK4-NEXT:    movq -40(%rsp,%r9,4), %r9
+; FALLBACK4-NEXT:    leaq (%r9,%r9), %rbx
+; FALLBACK4-NEXT:    movl %esi, %ecx
+; FALLBACK4-NEXT:    shlq %cl, %rbx
+; FALLBACK4-NEXT:    orq %r11, %rbx
+; FALLBACK4-NEXT:    movl %eax, %ecx
+; FALLBACK4-NEXT:    shrq %cl, %r8
+; FALLBACK4-NEXT:    addq %r10, %r10
+; FALLBACK4-NEXT:    movl %esi, %ecx
+; FALLBACK4-NEXT:    shlq %cl, %r10
+; FALLBACK4-NEXT:    orq %r8, %r10
+; FALLBACK4-NEXT:    movl %eax, %ecx
+; FALLBACK4-NEXT:    sarq %cl, %r9
+; FALLBACK4-NEXT:    movq %r9, 24(%rdx)
+; FALLBACK4-NEXT:    movq %r10, 8(%rdx)
+; FALLBACK4-NEXT:    movq %rbx, 16(%rdx)
+; FALLBACK4-NEXT:    movq %rdi, (%rdx)
+; FALLBACK4-NEXT:    popq %rbx
+; FALLBACK4-NEXT:    retq
+;
+; FALLBACK5-LABEL: ashr_32bytes_dwordOff:
+; FALLBACK5:       # %bb.0:
+; FALLBACK5-NEXT:    movups (%rdi), %xmm0
+; FALLBACK5-NEXT:    movq 16(%rdi), %rax
+; FALLBACK5-NEXT:    movq 24(%rdi), %rdi
+; FALLBACK5-NEXT:    movzbl (%rsi), %esi
+; FALLBACK5-NEXT:    movl %esi, %ecx
+; FALLBACK5-NEXT:    shlb $5, %cl
+; FALLBACK5-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK5-NEXT:    movq %rax, -{{[0-9]+}}(%rsp)
+; FALLBACK5-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
+; FALLBACK5-NEXT:    sarq $63, %rdi
+; FALLBACK5-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK5-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK5-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK5-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK5-NEXT:    andb $6, %sil
+; FALLBACK5-NEXT:    movzbl %sil, %eax
+; FALLBACK5-NEXT:    movq -48(%rsp,%rax,4), %rsi
+; FALLBACK5-NEXT:    movq -56(%rsp,%rax,4), %rdi
+; FALLBACK5-NEXT:    movq %rdi, %r8
+; FALLBACK5-NEXT:    shrdq %cl, %rsi, %r8
+; FALLBACK5-NEXT:    movq -72(%rsp,%rax,4), %r9
+; FALLBACK5-NEXT:    movq -64(%rsp,%rax,4), %rax
+; FALLBACK5-NEXT:    movq %rax, %r10
+; FALLBACK5-NEXT:    shrdq %cl, %rdi, %r10
+; FALLBACK5-NEXT:    shrdq %cl, %rax, %r9
+; FALLBACK5-NEXT:    sarq %cl, %rsi
+; FALLBACK5-NEXT:    movq %r10, 8(%rdx)
+; FALLBACK5-NEXT:    movq %r8, 16(%rdx)
+; FALLBACK5-NEXT:    movq %rsi, 24(%rdx)
+; FALLBACK5-NEXT:    movq %r9, (%rdx)
+; FALLBACK5-NEXT:    retq
+;
+; FALLBACK6-LABEL: ashr_32bytes_dwordOff:
+; FALLBACK6:       # %bb.0:
+; FALLBACK6-NEXT:    movups (%rdi), %xmm0
+; FALLBACK6-NEXT:    movq 16(%rdi), %rcx
+; FALLBACK6-NEXT:    movq 24(%rdi), %rdi
+; FALLBACK6-NEXT:    movzbl (%rsi), %esi
+; FALLBACK6-NEXT:    movl %esi, %eax
+; FALLBACK6-NEXT:    shlb $5, %al
+; FALLBACK6-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK6-NEXT:    movq %rcx, -{{[0-9]+}}(%rsp)
+; FALLBACK6-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
+; FALLBACK6-NEXT:    sarq $63, %rdi
+; FALLBACK6-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK6-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK6-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK6-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK6-NEXT:    andb $6, %sil
+; FALLBACK6-NEXT:    movzbl %sil, %ecx
+; FALLBACK6-NEXT:    shrxq %rax, -72(%rsp,%rcx,4), %rsi
+; FALLBACK6-NEXT:    movq -64(%rsp,%rcx,4), %rdi
+; FALLBACK6-NEXT:    movq -56(%rsp,%rcx,4), %r8
+; FALLBACK6-NEXT:    shrxq %rax, %r8, %r9
+; FALLBACK6-NEXT:    movq -48(%rsp,%rcx,4), %rcx
+; FALLBACK6-NEXT:    shrxq %rax, %rdi, %r10
+; FALLBACK6-NEXT:    sarxq %rax, %rcx, %r11
+; FALLBACK6-NEXT:    # kill: def $al killed $al killed $rax def $rax
+; FALLBACK6-NEXT:    notb %al
+; FALLBACK6-NEXT:    addq %rdi, %rdi
+; FALLBACK6-NEXT:    shlxq %rax, %rdi, %rdi
+; FALLBACK6-NEXT:    orq %rsi, %rdi
+; FALLBACK6-NEXT:    addq %rcx, %rcx
+; FALLBACK6-NEXT:    shlxq %rax, %rcx, %rcx
+; FALLBACK6-NEXT:    orq %r9, %rcx
+; FALLBACK6-NEXT:    addq %r8, %r8
+; FALLBACK6-NEXT:    shlxq %rax, %r8, %rax
+; FALLBACK6-NEXT:    orq %r10, %rax
+; FALLBACK6-NEXT:    movq %r11, 24(%rdx)
+; FALLBACK6-NEXT:    movq %rax, 8(%rdx)
+; FALLBACK6-NEXT:    movq %rcx, 16(%rdx)
+; FALLBACK6-NEXT:    movq %rdi, (%rdx)
+; FALLBACK6-NEXT:    retq
+;
+; FALLBACK7-LABEL: ashr_32bytes_dwordOff:
+; FALLBACK7:       # %bb.0:
+; FALLBACK7-NEXT:    movups (%rdi), %xmm0
+; FALLBACK7-NEXT:    movq 16(%rdi), %rax
+; FALLBACK7-NEXT:    movq 24(%rdi), %rdi
+; FALLBACK7-NEXT:    movzbl (%rsi), %esi
+; FALLBACK7-NEXT:    movl %esi, %ecx
+; FALLBACK7-NEXT:    shlb $5, %cl
+; FALLBACK7-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK7-NEXT:    movq %rax, -{{[0-9]+}}(%rsp)
+; FALLBACK7-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
+; FALLBACK7-NEXT:    sarq $63, %rdi
+; FALLBACK7-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK7-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK7-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK7-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK7-NEXT:    andb $6, %sil
+; FALLBACK7-NEXT:    movzbl %sil, %eax
+; FALLBACK7-NEXT:    movq -48(%rsp,%rax,4), %rsi
+; FALLBACK7-NEXT:    movq -56(%rsp,%rax,4), %rdi
+; FALLBACK7-NEXT:    movq %rdi, %r8
+; FALLBACK7-NEXT:    shrdq %cl, %rsi, %r8
+; FALLBACK7-NEXT:    movq -72(%rsp,%rax,4), %r9
+; FALLBACK7-NEXT:    movq -64(%rsp,%rax,4), %rax
+; FALLBACK7-NEXT:    movq %rax, %r10
+; FALLBACK7-NEXT:    shrdq %cl, %rdi, %r10
+; FALLBACK7-NEXT:    shrdq %cl, %rax, %r9
+; FALLBACK7-NEXT:    sarxq %rcx, %rsi, %rax
+; FALLBACK7-NEXT:    movq %r10, 8(%rdx)
+; FALLBACK7-NEXT:    movq %r8, 16(%rdx)
+; FALLBACK7-NEXT:    movq %rax, 24(%rdx)
+; FALLBACK7-NEXT:    movq %r9, (%rdx)
+; FALLBACK7-NEXT:    retq
+;
+; FALLBACK8-LABEL: ashr_32bytes_dwordOff:
+; FALLBACK8:       # %bb.0:
+; FALLBACK8-NEXT:    pushq %rbx
+; FALLBACK8-NEXT:    vmovups (%rdi), %xmm0
+; FALLBACK8-NEXT:    movq 16(%rdi), %rcx
+; FALLBACK8-NEXT:    movq 24(%rdi), %rdi
+; FALLBACK8-NEXT:    movzbl (%rsi), %esi
+; FALLBACK8-NEXT:    movl %esi, %eax
+; FALLBACK8-NEXT:    shlb $5, %al
+; FALLBACK8-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK8-NEXT:    movq %rcx, -{{[0-9]+}}(%rsp)
+; FALLBACK8-NEXT:    vmovaps %xmm0, -{{[0-9]+}}(%rsp)
+; FALLBACK8-NEXT:    sarq $63, %rdi
+; FALLBACK8-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK8-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK8-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK8-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK8-NEXT:    andb $6, %sil
+; FALLBACK8-NEXT:    movzbl %sil, %r9d
+; FALLBACK8-NEXT:    movq -64(%rsp,%r9,4), %r10
+; FALLBACK8-NEXT:    movq -56(%rsp,%r9,4), %r8
+; FALLBACK8-NEXT:    movl %eax, %ecx
+; FALLBACK8-NEXT:    shrq %cl, %r10
+; FALLBACK8-NEXT:    movl %eax, %esi
+; FALLBACK8-NEXT:    notb %sil
+; FALLBACK8-NEXT:    leaq (%r8,%r8), %rdi
+; FALLBACK8-NEXT:    movl %esi, %ecx
+; FALLBACK8-NEXT:    shlq %cl, %rdi
+; FALLBACK8-NEXT:    orq %r10, %rdi
+; FALLBACK8-NEXT:    movq -48(%rsp,%r9,4), %r10
+; FALLBACK8-NEXT:    movq %r10, %r11
+; FALLBACK8-NEXT:    movl %eax, %ecx
+; FALLBACK8-NEXT:    shrq %cl, %r11
+; FALLBACK8-NEXT:    movq -40(%rsp,%r9,4), %r9
+; FALLBACK8-NEXT:    leaq (%r9,%r9), %rbx
+; FALLBACK8-NEXT:    movl %esi, %ecx
+; FALLBACK8-NEXT:    shlq %cl, %rbx
+; FALLBACK8-NEXT:    orq %r11, %rbx
+; FALLBACK8-NEXT:    movl %eax, %ecx
+; FALLBACK8-NEXT:    shrq %cl, %r8
+; FALLBACK8-NEXT:    addq %r10, %r10
+; FALLBACK8-NEXT:    movl %esi, %ecx
+; FALLBACK8-NEXT:    shlq %cl, %r10
+; FALLBACK8-NEXT:    orq %r8, %r10
+; FALLBACK8-NEXT:    movl %eax, %ecx
+; FALLBACK8-NEXT:    sarq %cl, %r9
+; FALLBACK8-NEXT:    movq %r9, 24(%rdx)
+; FALLBACK8-NEXT:    movq %r10, 8(%rdx)
+; FALLBACK8-NEXT:    movq %rbx, 16(%rdx)
+; FALLBACK8-NEXT:    movq %rdi, (%rdx)
+; FALLBACK8-NEXT:    popq %rbx
+; FALLBACK8-NEXT:    retq
+;
+; FALLBACK9-LABEL: ashr_32bytes_dwordOff:
+; FALLBACK9:       # %bb.0:
+; FALLBACK9-NEXT:    vmovups (%rdi), %xmm0
+; FALLBACK9-NEXT:    movq 16(%rdi), %rax
+; FALLBACK9-NEXT:    movq 24(%rdi), %rdi
+; FALLBACK9-NEXT:    movzbl (%rsi), %esi
+; FALLBACK9-NEXT:    movl %esi, %ecx
+; FALLBACK9-NEXT:    shlb $5, %cl
+; FALLBACK9-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK9-NEXT:    movq %rax, -{{[0-9]+}}(%rsp)
+; FALLBACK9-NEXT:    vmovaps %xmm0, -{{[0-9]+}}(%rsp)
+; FALLBACK9-NEXT:    sarq $63, %rdi
+; FALLBACK9-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK9-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK9-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK9-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK9-NEXT:    andb $6, %sil
+; FALLBACK9-NEXT:    movzbl %sil, %eax
+; FALLBACK9-NEXT:    movq -48(%rsp,%rax,4), %rsi
+; FALLBACK9-NEXT:    movq -56(%rsp,%rax,4), %rdi
+; FALLBACK9-NEXT:    movq %rdi, %r8
+; FALLBACK9-NEXT:    shrdq %cl, %rsi, %r8
+; FALLBACK9-NEXT:    movq -72(%rsp,%rax,4), %r9
+; FALLBACK9-NEXT:    movq -64(%rsp,%rax,4), %rax
+; FALLBACK9-NEXT:    movq %rax, %r10
+; FALLBACK9-NEXT:    shrdq %cl, %rdi, %r10
+; FALLBACK9-NEXT:    shrdq %cl, %rax, %r9
+; FALLBACK9-NEXT:    sarq %cl, %rsi
+; FALLBACK9-NEXT:    movq %r10, 8(%rdx)
+; FALLBACK9-NEXT:    movq %r8, 16(%rdx)
+; FALLBACK9-NEXT:    movq %rsi, 24(%rdx)
+; FALLBACK9-NEXT:    movq %r9, (%rdx)
+; FALLBACK9-NEXT:    retq
+;
+; FALLBACK10-LABEL: ashr_32bytes_dwordOff:
+; FALLBACK10:       # %bb.0:
+; FALLBACK10-NEXT:    vmovups (%rdi), %xmm0
+; FALLBACK10-NEXT:    movq 16(%rdi), %rcx
+; FALLBACK10-NEXT:    movq 24(%rdi), %rdi
+; FALLBACK10-NEXT:    movzbl (%rsi), %esi
+; FALLBACK10-NEXT:    movl %esi, %eax
+; FALLBACK10-NEXT:    shlb $5, %al
+; FALLBACK10-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK10-NEXT:    movq %rcx, -{{[0-9]+}}(%rsp)
+; FALLBACK10-NEXT:    vmovaps %xmm0, -{{[0-9]+}}(%rsp)
+; FALLBACK10-NEXT:    sarq $63, %rdi
+; FALLBACK10-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK10-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK10-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK10-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK10-NEXT:    andb $6, %sil
+; FALLBACK10-NEXT:    movzbl %sil, %ecx
+; FALLBACK10-NEXT:    shrxq %rax, -72(%rsp,%rcx,4), %rsi
+; FALLBACK10-NEXT:    movq -64(%rsp,%rcx,4), %rdi
+; FALLBACK10-NEXT:    movq -56(%rsp,%rcx,4), %r8
+; FALLBACK10-NEXT:    shrxq %rax, %r8, %r9
+; FALLBACK10-NEXT:    movq -48(%rsp,%rcx,4), %rcx
+; FALLBACK10-NEXT:    shrxq %rax, %rdi, %r10
+; FALLBACK10-NEXT:    sarxq %rax, %rcx, %r11
+; FALLBACK10-NEXT:    # kill: def $al killed $al killed $rax def $rax
+; FALLBACK10-NEXT:    notb %al
+; FALLBACK10-NEXT:    addq %rdi, %rdi
+; FALLBACK10-NEXT:    shlxq %rax, %rdi, %rdi
+; FALLBACK10-NEXT:    orq %rsi, %rdi
+; FALLBACK10-NEXT:    addq %rcx, %rcx
+; FALLBACK10-NEXT:    shlxq %rax, %rcx, %rcx
+; FALLBACK10-NEXT:    orq %r9, %rcx
+; FALLBACK10-NEXT:    addq %r8, %r8
+; FALLBACK10-NEXT:    shlxq %rax, %r8, %rax
+; FALLBACK10-NEXT:    orq %r10, %rax
+; FALLBACK10-NEXT:    movq %r11, 24(%rdx)
+; FALLBACK10-NEXT:    movq %rax, 8(%rdx)
+; FALLBACK10-NEXT:    movq %rcx, 16(%rdx)
+; FALLBACK10-NEXT:    movq %rdi, (%rdx)
+; FALLBACK10-NEXT:    retq
+;
+; FALLBACK11-LABEL: ashr_32bytes_dwordOff:
+; FALLBACK11:       # %bb.0:
+; FALLBACK11-NEXT:    vmovups (%rdi), %xmm0
+; FALLBACK11-NEXT:    movq 16(%rdi), %rax
+; FALLBACK11-NEXT:    movq 24(%rdi), %rdi
+; FALLBACK11-NEXT:    movzbl (%rsi), %esi
+; FALLBACK11-NEXT:    movl %esi, %ecx
+; FALLBACK11-NEXT:    shlb $5, %cl
+; FALLBACK11-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK11-NEXT:    movq %rax, -{{[0-9]+}}(%rsp)
+; FALLBACK11-NEXT:    vmovaps %xmm0, -{{[0-9]+}}(%rsp)
+; FALLBACK11-NEXT:    sarq $63, %rdi
+; FALLBACK11-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK11-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK11-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK11-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK11-NEXT:    andb $6, %sil
+; FALLBACK11-NEXT:    movzbl %sil, %eax
+; FALLBACK11-NEXT:    movq -48(%rsp,%rax,4), %rsi
+; FALLBACK11-NEXT:    movq -56(%rsp,%rax,4), %rdi
+; FALLBACK11-NEXT:    movq %rdi, %r8
+; FALLBACK11-NEXT:    shrdq %cl, %rsi, %r8
+; FALLBACK11-NEXT:    movq -72(%rsp,%rax,4), %r9
+; FALLBACK11-NEXT:    movq -64(%rsp,%rax,4), %rax
+; FALLBACK11-NEXT:    movq %rax, %r10
+; FALLBACK11-NEXT:    shrdq %cl, %rdi, %r10
+; FALLBACK11-NEXT:    shrdq %cl, %rax, %r9
+; FALLBACK11-NEXT:    sarxq %rcx, %rsi, %rax
+; FALLBACK11-NEXT:    movq %r10, 8(%rdx)
+; FALLBACK11-NEXT:    movq %r8, 16(%rdx)
+; FALLBACK11-NEXT:    movq %rax, 24(%rdx)
+; FALLBACK11-NEXT:    movq %r9, (%rdx)
+; FALLBACK11-NEXT:    retq
+;
+; FALLBACK12-LABEL: ashr_32bytes_dwordOff:
+; FALLBACK12:       # %bb.0:
+; FALLBACK12-NEXT:    pushq %rbx
+; FALLBACK12-NEXT:    vmovups (%rdi), %xmm0
+; FALLBACK12-NEXT:    movq 16(%rdi), %rcx
+; FALLBACK12-NEXT:    movq 24(%rdi), %rdi
+; FALLBACK12-NEXT:    movzbl (%rsi), %esi
+; FALLBACK12-NEXT:    movl %esi, %eax
+; FALLBACK12-NEXT:    shlb $5, %al
+; FALLBACK12-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK12-NEXT:    movq %rcx, -{{[0-9]+}}(%rsp)
+; FALLBACK12-NEXT:    vmovaps %xmm0, -{{[0-9]+}}(%rsp)
+; FALLBACK12-NEXT:    sarq $63, %rdi
+; FALLBACK12-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK12-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK12-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK12-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK12-NEXT:    andb $6, %sil
+; FALLBACK12-NEXT:    movzbl %sil, %r9d
+; FALLBACK12-NEXT:    movq -64(%rsp,%r9,4), %r10
+; FALLBACK12-NEXT:    movq -56(%rsp,%r9,4), %r8
+; FALLBACK12-NEXT:    movl %eax, %ecx
+; FALLBACK12-NEXT:    shrq %cl, %r10
+; FALLBACK12-NEXT:    movl %eax, %esi
+; FALLBACK12-NEXT:    notb %sil
+; FALLBACK12-NEXT:    leaq (%r8,%r8), %rdi
+; FALLBACK12-NEXT:    movl %esi, %ecx
+; FALLBACK12-NEXT:    shlq %cl, %rdi
+; FALLBACK12-NEXT:    orq %r10, %rdi
+; FALLBACK12-NEXT:    movq -48(%rsp,%r9,4), %r10
+; FALLBACK12-NEXT:    movq %r10, %r11
+; FALLBACK12-NEXT:    movl %eax, %ecx
+; FALLBACK12-NEXT:    shrq %cl, %r11
+; FALLBACK12-NEXT:    movq -40(%rsp,%r9,4), %r9
+; FALLBACK12-NEXT:    leaq (%r9,%r9), %rbx
+; FALLBACK12-NEXT:    movl %esi, %ecx
+; FALLBACK12-NEXT:    shlq %cl, %rbx
+; FALLBACK12-NEXT:    orq %r11, %rbx
+; FALLBACK12-NEXT:    movl %eax, %ecx
+; FALLBACK12-NEXT:    shrq %cl, %r8
+; FALLBACK12-NEXT:    addq %r10, %r10
+; FALLBACK12-NEXT:    movl %esi, %ecx
+; FALLBACK12-NEXT:    shlq %cl, %r10
+; FALLBACK12-NEXT:    orq %r8, %r10
+; FALLBACK12-NEXT:    movl %eax, %ecx
+; FALLBACK12-NEXT:    sarq %cl, %r9
+; FALLBACK12-NEXT:    movq %r9, 24(%rdx)
+; FALLBACK12-NEXT:    movq %r10, 8(%rdx)
+; FALLBACK12-NEXT:    movq %rbx, 16(%rdx)
+; FALLBACK12-NEXT:    movq %rdi, (%rdx)
+; FALLBACK12-NEXT:    popq %rbx
+; FALLBACK12-NEXT:    retq
+;
+; FALLBACK13-LABEL: ashr_32bytes_dwordOff:
+; FALLBACK13:       # %bb.0:
+; FALLBACK13-NEXT:    vmovups (%rdi), %xmm0
+; FALLBACK13-NEXT:    movq 16(%rdi), %rax
+; FALLBACK13-NEXT:    movq 24(%rdi), %rdi
+; FALLBACK13-NEXT:    movzbl (%rsi), %esi
+; FALLBACK13-NEXT:    movl %esi, %ecx
+; FALLBACK13-NEXT:    shlb $5, %cl
+; FALLBACK13-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK13-NEXT:    movq %rax, -{{[0-9]+}}(%rsp)
+; FALLBACK13-NEXT:    vmovaps %xmm0, -{{[0-9]+}}(%rsp)
+; FALLBACK13-NEXT:    sarq $63, %rdi
+; FALLBACK13-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK13-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK13-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK13-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK13-NEXT:    andb $6, %sil
+; FALLBACK13-NEXT:    movzbl %sil, %eax
+; FALLBACK13-NEXT:    movq -48(%rsp,%rax,4), %rsi
+; FALLBACK13-NEXT:    movq -56(%rsp,%rax,4), %rdi
+; FALLBACK13-NEXT:    movq %rdi, %r8
+; FALLBACK13-NEXT:    shrdq %cl, %rsi, %r8
+; FALLBACK13-NEXT:    movq -72(%rsp,%rax,4), %r9
+; FALLBACK13-NEXT:    movq -64(%rsp,%rax,4), %rax
+; FALLBACK13-NEXT:    movq %rax, %r10
+; FALLBACK13-NEXT:    shrdq %cl, %rdi, %r10
+; FALLBACK13-NEXT:    shrdq %cl, %rax, %r9
+; FALLBACK13-NEXT:    sarq %cl, %rsi
+; FALLBACK13-NEXT:    movq %r10, 8(%rdx)
+; FALLBACK13-NEXT:    movq %r8, 16(%rdx)
+; FALLBACK13-NEXT:    movq %rsi, 24(%rdx)
+; FALLBACK13-NEXT:    movq %r9, (%rdx)
+; FALLBACK13-NEXT:    retq
+;
+; FALLBACK14-LABEL: ashr_32bytes_dwordOff:
+; FALLBACK14:       # %bb.0:
+; FALLBACK14-NEXT:    vmovups (%rdi), %xmm0
+; FALLBACK14-NEXT:    movq 16(%rdi), %rcx
+; FALLBACK14-NEXT:    movq 24(%rdi), %rdi
+; FALLBACK14-NEXT:    movzbl (%rsi), %esi
+; FALLBACK14-NEXT:    movl %esi, %eax
+; FALLBACK14-NEXT:    shlb $5, %al
+; FALLBACK14-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK14-NEXT:    movq %rcx, -{{[0-9]+}}(%rsp)
+; FALLBACK14-NEXT:    vmovaps %xmm0, -{{[0-9]+}}(%rsp)
+; FALLBACK14-NEXT:    sarq $63, %rdi
+; FALLBACK14-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK14-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK14-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK14-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK14-NEXT:    andb $6, %sil
+; FALLBACK14-NEXT:    movzbl %sil, %ecx
+; FALLBACK14-NEXT:    shrxq %rax, -72(%rsp,%rcx,4), %rsi
+; FALLBACK14-NEXT:    movq -64(%rsp,%rcx,4), %rdi
+; FALLBACK14-NEXT:    movq -56(%rsp,%rcx,4), %r8
+; FALLBACK14-NEXT:    shrxq %rax, %r8, %r9
+; FALLBACK14-NEXT:    movq -48(%rsp,%rcx,4), %rcx
+; FALLBACK14-NEXT:    shrxq %rax, %rdi, %r10
+; FALLBACK14-NEXT:    sarxq %rax, %rcx, %r11
+; FALLBACK14-NEXT:    # kill: def $al killed $al killed $rax def $rax
+; FALLBACK14-NEXT:    notb %al
+; FALLBACK14-NEXT:    addq %rdi, %rdi
+; FALLBACK14-NEXT:    shlxq %rax, %rdi, %rdi
+; FALLBACK14-NEXT:    orq %rsi, %rdi
+; FALLBACK14-NEXT:    addq %rcx, %rcx
+; FALLBACK14-NEXT:    shlxq %rax, %rcx, %rcx
+; FALLBACK14-NEXT:    orq %r9, %rcx
+; FALLBACK14-NEXT:    addq %r8, %r8
+; FALLBACK14-NEXT:    shlxq %rax, %r8, %rax
+; FALLBACK14-NEXT:    orq %r10, %rax
+; FALLBACK14-NEXT:    movq %r11, 24(%rdx)
+; FALLBACK14-NEXT:    movq %rax, 8(%rdx)
+; FALLBACK14-NEXT:    movq %rcx, 16(%rdx)
+; FALLBACK14-NEXT:    movq %rdi, (%rdx)
+; FALLBACK14-NEXT:    retq
+;
+; FALLBACK15-LABEL: ashr_32bytes_dwordOff:
+; FALLBACK15:       # %bb.0:
+; FALLBACK15-NEXT:    vmovups (%rdi), %xmm0
+; FALLBACK15-NEXT:    movq 16(%rdi), %rax
+; FALLBACK15-NEXT:    movq 24(%rdi), %rdi
+; FALLBACK15-NEXT:    movzbl (%rsi), %esi
+; FALLBACK15-NEXT:    movl %esi, %ecx
+; FALLBACK15-NEXT:    shlb $5, %cl
+; FALLBACK15-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK15-NEXT:    movq %rax, -{{[0-9]+}}(%rsp)
+; FALLBACK15-NEXT:    vmovaps %xmm0, -{{[0-9]+}}(%rsp)
+; FALLBACK15-NEXT:    sarq $63, %rdi
+; FALLBACK15-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK15-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK15-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK15-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK15-NEXT:    andb $6, %sil
+; FALLBACK15-NEXT:    movzbl %sil, %eax
+; FALLBACK15-NEXT:    movq -48(%rsp,%rax,4), %rsi
+; FALLBACK15-NEXT:    movq -56(%rsp,%rax,4), %rdi
+; FALLBACK15-NEXT:    movq %rdi, %r8
+; FALLBACK15-NEXT:    shrdq %cl, %rsi, %r8
+; FALLBACK15-NEXT:    movq -72(%rsp,%rax,4), %r9
+; FALLBACK15-NEXT:    movq -64(%rsp,%rax,4), %rax
+; FALLBACK15-NEXT:    movq %rax, %r10
+; FALLBACK15-NEXT:    shrdq %cl, %rdi, %r10
+; FALLBACK15-NEXT:    shrdq %cl, %rax, %r9
+; FALLBACK15-NEXT:    sarxq %rcx, %rsi, %rax
+; FALLBACK15-NEXT:    movq %r10, 8(%rdx)
+; FALLBACK15-NEXT:    movq %r8, 16(%rdx)
+; FALLBACK15-NEXT:    movq %rax, 24(%rdx)
+; FALLBACK15-NEXT:    movq %r9, (%rdx)
+; FALLBACK15-NEXT:    retq
+;
+; X86-SSE2-LABEL: ashr_32bytes_dwordOff:
+; X86-SSE2:       # %bb.0:
+; X86-SSE2-NEXT:    pushl %ebp
+; X86-SSE2-NEXT:    pushl %ebx
+; X86-SSE2-NEXT:    pushl %edi
+; X86-SSE2-NEXT:    pushl %esi
+; X86-SSE2-NEXT:    subl $92, %esp
+; X86-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-SSE2-NEXT:    movl (%eax), %ecx
+; X86-SSE2-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-SSE2-NEXT:    movl 4(%eax), %ecx
+; X86-SSE2-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-SSE2-NEXT:    movl 8(%eax), %edi
+; X86-SSE2-NEXT:    movl 12(%eax), %ebx
+; X86-SSE2-NEXT:    movl 16(%eax), %ebp
+; X86-SSE2-NEXT:    movl 20(%eax), %esi
+; X86-SSE2-NEXT:    movl 24(%eax), %edx
+; X86-SSE2-NEXT:    movl 28(%eax), %ecx
+; X86-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-SSE2-NEXT:    movzbl (%eax), %eax
+; X86-SSE2-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; X86-SSE2-NEXT:    movl %esi, {{[0-9]+}}(%esp)
+; X86-SSE2-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X86-SSE2-NEXT:    movl %ebp, {{[0-9]+}}(%esp)
+; X86-SSE2-NEXT:    movl %ebx, {{[0-9]+}}(%esp)
+; X86-SSE2-NEXT:    movl %edi, {{[0-9]+}}(%esp)
+; X86-SSE2-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-SSE2-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; X86-SSE2-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-SSE2-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; X86-SSE2-NEXT:    sarl $31, %ecx
+; X86-SSE2-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X86-SSE2-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X86-SSE2-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X86-SSE2-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X86-SSE2-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X86-SSE2-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X86-SSE2-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X86-SSE2-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X86-SSE2-NEXT:    andl $7, %eax
+; X86-SSE2-NEXT:    movl 16(%esp,%eax,4), %ecx
+; X86-SSE2-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-SSE2-NEXT:    movl 20(%esp,%eax,4), %ecx
+; X86-SSE2-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-SSE2-NEXT:    movl 28(%esp,%eax,4), %esi
+; X86-SSE2-NEXT:    movl 24(%esp,%eax,4), %edi
+; X86-SSE2-NEXT:    movl 36(%esp,%eax,4), %ebx
+; X86-SSE2-NEXT:    movl 32(%esp,%eax,4), %ebp
+; X86-SSE2-NEXT:    movl 44(%esp,%eax,4), %edx
+; X86-SSE2-NEXT:    movl 40(%esp,%eax,4), %ecx
+; X86-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-SSE2-NEXT:    movl %ecx, 24(%eax)
+; X86-SSE2-NEXT:    movl %edx, 28(%eax)
+; X86-SSE2-NEXT:    movl %ebp, 16(%eax)
+; X86-SSE2-NEXT:    movl %ebx, 20(%eax)
+; X86-SSE2-NEXT:    movl %edi, 8(%eax)
+; X86-SSE2-NEXT:    movl %esi, 12(%eax)
+; X86-SSE2-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-SSE2-NEXT:    movl %ecx, (%eax)
+; X86-SSE2-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-SSE2-NEXT:    movl %ecx, 4(%eax)
+; X86-SSE2-NEXT:    addl $92, %esp
+; X86-SSE2-NEXT:    popl %esi
+; X86-SSE2-NEXT:    popl %edi
+; X86-SSE2-NEXT:    popl %ebx
+; X86-SSE2-NEXT:    popl %ebp
+; X86-SSE2-NEXT:    retl
+;
+; X86-SSE42-LABEL: ashr_32bytes_dwordOff:
+; X86-SSE42:       # %bb.0:
+; X86-SSE42-NEXT:    pushl %ebx
+; X86-SSE42-NEXT:    pushl %edi
+; X86-SSE42-NEXT:    pushl %esi
+; X86-SSE42-NEXT:    subl $64, %esp
+; X86-SSE42-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-SSE42-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-SSE42-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-SSE42-NEXT:    movups (%edx), %xmm0
+; X86-SSE42-NEXT:    movl 16(%edx), %esi
+; X86-SSE42-NEXT:    movl 20(%edx), %edi
+; X86-SSE42-NEXT:    movl 24(%edx), %ebx
+; X86-SSE42-NEXT:    movl 28(%edx), %edx
+; X86-SSE42-NEXT:    movzbl (%ecx), %ecx
+; X86-SSE42-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; X86-SSE42-NEXT:    movl %ebx, {{[0-9]+}}(%esp)
+; X86-SSE42-NEXT:    movl %edi, {{[0-9]+}}(%esp)
+; X86-SSE42-NEXT:    movl %esi, {{[0-9]+}}(%esp)
+; X86-SSE42-NEXT:    movaps %xmm0, (%esp)
+; X86-SSE42-NEXT:    sarl $31, %edx
+; X86-SSE42-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; X86-SSE42-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; X86-SSE42-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; X86-SSE42-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; X86-SSE42-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; X86-SSE42-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; X86-SSE42-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; X86-SSE42-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; X86-SSE42-NEXT:    andl $7, %ecx
+; X86-SSE42-NEXT:    movups (%esp,%ecx,4), %xmm0
+; X86-SSE42-NEXT:    movups 16(%esp,%ecx,4), %xmm1
+; X86-SSE42-NEXT:    movups %xmm1, 16(%eax)
+; X86-SSE42-NEXT:    movups %xmm0, (%eax)
 ; X86-SSE42-NEXT:    addl $64, %esp
+; X86-SSE42-NEXT:    popl %esi
+; X86-SSE42-NEXT:    popl %edi
+; X86-SSE42-NEXT:    popl %ebx
 ; X86-SSE42-NEXT:    retl
 ;
-; X86-AVX-LABEL: shl_32bytes:
+; X86-AVX-LABEL: ashr_32bytes_dwordOff:
 ; X86-AVX:       # %bb.0:
+; X86-AVX-NEXT:    pushl %ebx
+; X86-AVX-NEXT:    pushl %edi
+; X86-AVX-NEXT:    pushl %esi
 ; X86-AVX-NEXT:    subl $64, %esp
 ; X86-AVX-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-AVX-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-AVX-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-AVX-NEXT:    vmovups (%edx), %ymm0
+; X86-AVX-NEXT:    vmovups (%edx), %xmm0
+; X86-AVX-NEXT:    movl 16(%edx), %esi
+; X86-AVX-NEXT:    movl 20(%edx), %edi
+; X86-AVX-NEXT:    movl 24(%edx), %ebx
+; X86-AVX-NEXT:    movl 28(%edx), %edx
 ; X86-AVX-NEXT:    movzbl (%ecx), %ecx
-; X86-AVX-NEXT:    vxorps %xmm1, %xmm1, %xmm1
-; X86-AVX-NEXT:    vmovups %ymm1, (%esp)
-; X86-AVX-NEXT:    vmovups %ymm0, {{[0-9]+}}(%esp)
-; X86-AVX-NEXT:    andb $31, %cl
-; X86-AVX-NEXT:    negb %cl
-; X86-AVX-NEXT:    movsbl %cl, %ecx
-; X86-AVX-NEXT:    vmovups 32(%esp,%ecx), %xmm0
-; X86-AVX-NEXT:    vmovups 48(%esp,%ecx), %xmm1
+; X86-AVX-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; X86-AVX-NEXT:    movl %ebx, {{[0-9]+}}(%esp)
+; X86-AVX-NEXT:    movl %edi, {{[0-9]+}}(%esp)
+; X86-AVX-NEXT:    movl %esi, {{[0-9]+}}(%esp)
+; X86-AVX-NEXT:    vmovaps %xmm0, (%esp)
+; X86-AVX-NEXT:    sarl $31, %edx
+; X86-AVX-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; X86-AVX-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; X86-AVX-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; X86-AVX-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; X86-AVX-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; X86-AVX-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; X86-AVX-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; X86-AVX-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; X86-AVX-NEXT:    andl $7, %ecx
+; X86-AVX-NEXT:    vmovups (%esp,%ecx,4), %xmm0
+; X86-AVX-NEXT:    vmovups 16(%esp,%ecx,4), %xmm1
 ; X86-AVX-NEXT:    vmovups %xmm1, 16(%eax)
 ; X86-AVX-NEXT:    vmovups %xmm0, (%eax)
 ; X86-AVX-NEXT:    addl $64, %esp
-; X86-AVX-NEXT:    vzeroupper
+; X86-AVX-NEXT:    popl %esi
+; X86-AVX-NEXT:    popl %edi
+; X86-AVX-NEXT:    popl %ebx
 ; X86-AVX-NEXT:    retl
   %src = load i256, ptr %src.ptr, align 1
-  %byteOff = load i256, ptr %byteOff.ptr, align 1
-  %bitOff = shl i256 %byteOff, 3
-  %res = shl i256 %src, %bitOff
+  %dwordOff = load i256, ptr %dwordOff.ptr, align 1
+  %bitOff = shl i256 %dwordOff, 5
+  %res = ashr i256 %src, %bitOff
   store i256 %res, ptr %dst, align 1
   ret void
 }
-define void @ashr_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
-; X64-SSE2-LABEL: ashr_32bytes:
+
+define void @ashr_32bytes_qwordOff(ptr %src.ptr, ptr %qwordOff.ptr, ptr %dst) nounwind {
+; X64-SSE2-LABEL: ashr_32bytes_qwordOff:
 ; X64-SSE2:       # %bb.0:
 ; X64-SSE2-NEXT:    movq (%rdi), %rax
 ; X64-SSE2-NEXT:    movq 8(%rdi), %rcx
@@ -1446,18 +11832,18 @@ define void @ashr_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ; X64-SSE2-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
 ; X64-SSE2-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
 ; X64-SSE2-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
-; X64-SSE2-NEXT:    andl $31, %esi
-; X64-SSE2-NEXT:    movq -64(%rsp,%rsi), %rax
-; X64-SSE2-NEXT:    movq -56(%rsp,%rsi), %rcx
-; X64-SSE2-NEXT:    movq -40(%rsp,%rsi), %rdi
-; X64-SSE2-NEXT:    movq -48(%rsp,%rsi), %rsi
+; X64-SSE2-NEXT:    andl $3, %esi
+; X64-SSE2-NEXT:    movq -72(%rsp,%rsi,8), %rax
+; X64-SSE2-NEXT:    movq -64(%rsp,%rsi,8), %rcx
+; X64-SSE2-NEXT:    movq -48(%rsp,%rsi,8), %rdi
+; X64-SSE2-NEXT:    movq -56(%rsp,%rsi,8), %rsi
 ; X64-SSE2-NEXT:    movq %rsi, 16(%rdx)
 ; X64-SSE2-NEXT:    movq %rdi, 24(%rdx)
 ; X64-SSE2-NEXT:    movq %rax, (%rdx)
 ; X64-SSE2-NEXT:    movq %rcx, 8(%rdx)
 ; X64-SSE2-NEXT:    retq
 ;
-; X64-SSE42-LABEL: ashr_32bytes:
+; X64-SSE42-LABEL: ashr_32bytes_qwordOff:
 ; X64-SSE42:       # %bb.0:
 ; X64-SSE42-NEXT:    movups (%rdi), %xmm0
 ; X64-SSE42-NEXT:    movq 16(%rdi), %rax
@@ -1465,20 +11851,20 @@ define void @ashr_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ; X64-SSE42-NEXT:    movzbl (%rsi), %esi
 ; X64-SSE42-NEXT:    movq %rcx, -{{[0-9]+}}(%rsp)
 ; X64-SSE42-NEXT:    movq %rax, -{{[0-9]+}}(%rsp)
-; X64-SSE42-NEXT:    movups %xmm0, -{{[0-9]+}}(%rsp)
+; X64-SSE42-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
 ; X64-SSE42-NEXT:    sarq $63, %rcx
 ; X64-SSE42-NEXT:    movq %rcx, -{{[0-9]+}}(%rsp)
 ; X64-SSE42-NEXT:    movq %rcx, -{{[0-9]+}}(%rsp)
 ; X64-SSE42-NEXT:    movq %rcx, -{{[0-9]+}}(%rsp)
 ; X64-SSE42-NEXT:    movq %rcx, -{{[0-9]+}}(%rsp)
-; X64-SSE42-NEXT:    andl $31, %esi
-; X64-SSE42-NEXT:    movups -64(%rsp,%rsi), %xmm0
-; X64-SSE42-NEXT:    movups -48(%rsp,%rsi), %xmm1
+; X64-SSE42-NEXT:    andl $3, %esi
+; X64-SSE42-NEXT:    movups -72(%rsp,%rsi,8), %xmm0
+; X64-SSE42-NEXT:    movups -56(%rsp,%rsi,8), %xmm1
 ; X64-SSE42-NEXT:    movups %xmm1, 16(%rdx)
 ; X64-SSE42-NEXT:    movups %xmm0, (%rdx)
 ; X64-SSE42-NEXT:    retq
 ;
-; X64-AVX-LABEL: ashr_32bytes:
+; X64-AVX-LABEL: ashr_32bytes_qwordOff:
 ; X64-AVX:       # %bb.0:
 ; X64-AVX-NEXT:    vmovups (%rdi), %xmm0
 ; X64-AVX-NEXT:    movq 16(%rdi), %rax
@@ -1486,31 +11872,31 @@ define void @ashr_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ; X64-AVX-NEXT:    movzbl (%rsi), %esi
 ; X64-AVX-NEXT:    movq %rcx, -{{[0-9]+}}(%rsp)
 ; X64-AVX-NEXT:    movq %rax, -{{[0-9]+}}(%rsp)
-; X64-AVX-NEXT:    vmovups %xmm0, -{{[0-9]+}}(%rsp)
+; X64-AVX-NEXT:    vmovaps %xmm0, -{{[0-9]+}}(%rsp)
 ; X64-AVX-NEXT:    sarq $63, %rcx
 ; X64-AVX-NEXT:    movq %rcx, -{{[0-9]+}}(%rsp)
 ; X64-AVX-NEXT:    movq %rcx, -{{[0-9]+}}(%rsp)
 ; X64-AVX-NEXT:    movq %rcx, -{{[0-9]+}}(%rsp)
 ; X64-AVX-NEXT:    movq %rcx, -{{[0-9]+}}(%rsp)
-; X64-AVX-NEXT:    andl $31, %esi
-; X64-AVX-NEXT:    vmovups -64(%rsp,%rsi), %xmm0
-; X64-AVX-NEXT:    vmovups -48(%rsp,%rsi), %xmm1
+; X64-AVX-NEXT:    andl $3, %esi
+; X64-AVX-NEXT:    vmovups -72(%rsp,%rsi,8), %xmm0
+; X64-AVX-NEXT:    vmovups -56(%rsp,%rsi,8), %xmm1
 ; X64-AVX-NEXT:    vmovups %xmm1, 16(%rdx)
 ; X64-AVX-NEXT:    vmovups %xmm0, (%rdx)
 ; X64-AVX-NEXT:    retq
 ;
-; X86-SSE2-LABEL: ashr_32bytes:
+; X86-SSE2-LABEL: ashr_32bytes_qwordOff:
 ; X86-SSE2:       # %bb.0:
 ; X86-SSE2-NEXT:    pushl %ebp
 ; X86-SSE2-NEXT:    pushl %ebx
 ; X86-SSE2-NEXT:    pushl %edi
 ; X86-SSE2-NEXT:    pushl %esi
-; X86-SSE2-NEXT:    subl $72, %esp
+; X86-SSE2-NEXT:    subl $92, %esp
 ; X86-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-SSE2-NEXT:    movl (%eax), %ecx
 ; X86-SSE2-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-SSE2-NEXT:    movl 4(%eax), %ecx
-; X86-SSE2-NEXT:    movl %ecx, (%esp) # 4-byte Spill
+; X86-SSE2-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-SSE2-NEXT:    movl 8(%eax), %edi
 ; X86-SSE2-NEXT:    movl 12(%eax), %ebx
 ; X86-SSE2-NEXT:    movl 16(%eax), %ebp
@@ -1525,7 +11911,7 @@ define void @ashr_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ; X86-SSE2-NEXT:    movl %ebp, {{[0-9]+}}(%esp)
 ; X86-SSE2-NEXT:    movl %ebx, {{[0-9]+}}(%esp)
 ; X86-SSE2-NEXT:    movl %edi, {{[0-9]+}}(%esp)
-; X86-SSE2-NEXT:    movl (%esp), %edx # 4-byte Reload
+; X86-SSE2-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
 ; X86-SSE2-NEXT:    movl %edx, {{[0-9]+}}(%esp)
 ; X86-SSE2-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
 ; X86-SSE2-NEXT:    movl %edx, {{[0-9]+}}(%esp)
@@ -1538,17 +11924,17 @@ define void @ashr_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ; X86-SSE2-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
 ; X86-SSE2-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
 ; X86-SSE2-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
-; X86-SSE2-NEXT:    andl $31, %eax
-; X86-SSE2-NEXT:    movl 8(%esp,%eax), %ecx
+; X86-SSE2-NEXT:    andl $3, %eax
+; X86-SSE2-NEXT:    movl 16(%esp,%eax,8), %ecx
 ; X86-SSE2-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-SSE2-NEXT:    movl 12(%esp,%eax), %ecx
-; X86-SSE2-NEXT:    movl %ecx, (%esp) # 4-byte Spill
-; X86-SSE2-NEXT:    movl 20(%esp,%eax), %esi
-; X86-SSE2-NEXT:    movl 16(%esp,%eax), %edi
-; X86-SSE2-NEXT:    movl 28(%esp,%eax), %ebx
-; X86-SSE2-NEXT:    movl 24(%esp,%eax), %ebp
-; X86-SSE2-NEXT:    movl 36(%esp,%eax), %edx
-; X86-SSE2-NEXT:    movl 32(%esp,%eax), %ecx
+; X86-SSE2-NEXT:    movl 20(%esp,%eax,8), %ecx
+; X86-SSE2-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-SSE2-NEXT:    movl 28(%esp,%eax,8), %esi
+; X86-SSE2-NEXT:    movl 24(%esp,%eax,8), %edi
+; X86-SSE2-NEXT:    movl 36(%esp,%eax,8), %ebx
+; X86-SSE2-NEXT:    movl 32(%esp,%eax,8), %ebp
+; X86-SSE2-NEXT:    movl 44(%esp,%eax,8), %edx
+; X86-SSE2-NEXT:    movl 40(%esp,%eax,8), %ecx
 ; X86-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-SSE2-NEXT:    movl %ecx, 24(%eax)
 ; X86-SSE2-NEXT:    movl %edx, 28(%eax)
@@ -1558,16 +11944,16 @@ define void @ashr_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ; X86-SSE2-NEXT:    movl %esi, 12(%eax)
 ; X86-SSE2-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
 ; X86-SSE2-NEXT:    movl %ecx, (%eax)
-; X86-SSE2-NEXT:    movl (%esp), %ecx # 4-byte Reload
+; X86-SSE2-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
 ; X86-SSE2-NEXT:    movl %ecx, 4(%eax)
-; X86-SSE2-NEXT:    addl $72, %esp
+; X86-SSE2-NEXT:    addl $92, %esp
 ; X86-SSE2-NEXT:    popl %esi
 ; X86-SSE2-NEXT:    popl %edi
 ; X86-SSE2-NEXT:    popl %ebx
 ; X86-SSE2-NEXT:    popl %ebp
 ; X86-SSE2-NEXT:    retl
 ;
-; X86-SSE42-LABEL: ashr_32bytes:
+; X86-SSE42-LABEL: ashr_32bytes_qwordOff:
 ; X86-SSE42:       # %bb.0:
 ; X86-SSE42-NEXT:    pushl %ebx
 ; X86-SSE42-NEXT:    pushl %edi
@@ -1586,7 +11972,7 @@ define void @ashr_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ; X86-SSE42-NEXT:    movl %ebx, {{[0-9]+}}(%esp)
 ; X86-SSE42-NEXT:    movl %edi, {{[0-9]+}}(%esp)
 ; X86-SSE42-NEXT:    movl %esi, {{[0-9]+}}(%esp)
-; X86-SSE42-NEXT:    movups %xmm0, (%esp)
+; X86-SSE42-NEXT:    movaps %xmm0, (%esp)
 ; X86-SSE42-NEXT:    sarl $31, %edx
 ; X86-SSE42-NEXT:    movl %edx, {{[0-9]+}}(%esp)
 ; X86-SSE42-NEXT:    movl %edx, {{[0-9]+}}(%esp)
@@ -1596,9 +11982,9 @@ define void @ashr_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ; X86-SSE42-NEXT:    movl %edx, {{[0-9]+}}(%esp)
 ; X86-SSE42-NEXT:    movl %edx, {{[0-9]+}}(%esp)
 ; X86-SSE42-NEXT:    movl %edx, {{[0-9]+}}(%esp)
-; X86-SSE42-NEXT:    andl $31, %ecx
-; X86-SSE42-NEXT:    movups (%esp,%ecx), %xmm0
-; X86-SSE42-NEXT:    movups 16(%esp,%ecx), %xmm1
+; X86-SSE42-NEXT:    andl $3, %ecx
+; X86-SSE42-NEXT:    movups (%esp,%ecx,8), %xmm0
+; X86-SSE42-NEXT:    movups 16(%esp,%ecx,8), %xmm1
 ; X86-SSE42-NEXT:    movups %xmm1, 16(%eax)
 ; X86-SSE42-NEXT:    movups %xmm0, (%eax)
 ; X86-SSE42-NEXT:    addl $64, %esp
@@ -1607,7 +11993,7 @@ define void @ashr_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ; X86-SSE42-NEXT:    popl %ebx
 ; X86-SSE42-NEXT:    retl
 ;
-; X86-AVX-LABEL: ashr_32bytes:
+; X86-AVX-LABEL: ashr_32bytes_qwordOff:
 ; X86-AVX:       # %bb.0:
 ; X86-AVX-NEXT:    pushl %ebx
 ; X86-AVX-NEXT:    pushl %edi
@@ -1626,7 +12012,7 @@ define void @ashr_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ; X86-AVX-NEXT:    movl %ebx, {{[0-9]+}}(%esp)
 ; X86-AVX-NEXT:    movl %edi, {{[0-9]+}}(%esp)
 ; X86-AVX-NEXT:    movl %esi, {{[0-9]+}}(%esp)
-; X86-AVX-NEXT:    vmovups %xmm0, (%esp)
+; X86-AVX-NEXT:    vmovaps %xmm0, (%esp)
 ; X86-AVX-NEXT:    sarl $31, %edx
 ; X86-AVX-NEXT:    movl %edx, {{[0-9]+}}(%esp)
 ; X86-AVX-NEXT:    movl %edx, {{[0-9]+}}(%esp)
@@ -1636,9 +12022,9 @@ define void @ashr_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ; X86-AVX-NEXT:    movl %edx, {{[0-9]+}}(%esp)
 ; X86-AVX-NEXT:    movl %edx, {{[0-9]+}}(%esp)
 ; X86-AVX-NEXT:    movl %edx, {{[0-9]+}}(%esp)
-; X86-AVX-NEXT:    andl $31, %ecx
-; X86-AVX-NEXT:    vmovups (%esp,%ecx), %xmm0
-; X86-AVX-NEXT:    vmovups 16(%esp,%ecx), %xmm1
+; X86-AVX-NEXT:    andl $3, %ecx
+; X86-AVX-NEXT:    vmovups (%esp,%ecx,8), %xmm0
+; X86-AVX-NEXT:    vmovups 16(%esp,%ecx,8), %xmm1
 ; X86-AVX-NEXT:    vmovups %xmm1, 16(%eax)
 ; X86-AVX-NEXT:    vmovups %xmm0, (%eax)
 ; X86-AVX-NEXT:    addl $64, %esp
@@ -1647,15 +12033,3662 @@ define void @ashr_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ; X86-AVX-NEXT:    popl %ebx
 ; X86-AVX-NEXT:    retl
   %src = load i256, ptr %src.ptr, align 1
-  %byteOff = load i256, ptr %byteOff.ptr, align 1
-  %bitOff = shl i256 %byteOff, 3
+  %qwordOff = load i256, ptr %qwordOff.ptr, align 1
+  %bitOff = shl i256 %qwordOff, 6
   %res = ashr i256 %src, %bitOff
   store i256 %res, ptr %dst, align 1
   ret void
 }
 
 define void @lshr_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
-; X64-SSE2-LABEL: lshr_64bytes:
+; FALLBACK0-LABEL: lshr_64bytes:
+; FALLBACK0:       # %bb.0:
+; FALLBACK0-NEXT:    pushq %r15
+; FALLBACK0-NEXT:    pushq %r14
+; FALLBACK0-NEXT:    pushq %r13
+; FALLBACK0-NEXT:    pushq %r12
+; FALLBACK0-NEXT:    pushq %rbx
+; FALLBACK0-NEXT:    movq (%rdi), %rax
+; FALLBACK0-NEXT:    movq 8(%rdi), %rcx
+; FALLBACK0-NEXT:    movq 16(%rdi), %r8
+; FALLBACK0-NEXT:    movq 24(%rdi), %r9
+; FALLBACK0-NEXT:    movq 32(%rdi), %r10
+; FALLBACK0-NEXT:    movq 40(%rdi), %r11
+; FALLBACK0-NEXT:    movq 48(%rdi), %rbx
+; FALLBACK0-NEXT:    movq 56(%rdi), %r14
+; FALLBACK0-NEXT:    movl (%rsi), %edi
+; FALLBACK0-NEXT:    xorps %xmm0, %xmm0
+; FALLBACK0-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
+; FALLBACK0-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
+; FALLBACK0-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
+; FALLBACK0-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
+; FALLBACK0-NEXT:    movq %r14, -{{[0-9]+}}(%rsp)
+; FALLBACK0-NEXT:    movq %rbx, -{{[0-9]+}}(%rsp)
+; FALLBACK0-NEXT:    movq %r11, -{{[0-9]+}}(%rsp)
+; FALLBACK0-NEXT:    movq %r10, -{{[0-9]+}}(%rsp)
+; FALLBACK0-NEXT:    movq %r9, -{{[0-9]+}}(%rsp)
+; FALLBACK0-NEXT:    movq %r8, -{{[0-9]+}}(%rsp)
+; FALLBACK0-NEXT:    movq %rcx, -{{[0-9]+}}(%rsp)
+; FALLBACK0-NEXT:    movq %rax, -{{[0-9]+}}(%rsp)
+; FALLBACK0-NEXT:    leal (,%rdi,8), %eax
+; FALLBACK0-NEXT:    andl $56, %eax
+; FALLBACK0-NEXT:    andl $56, %edi
+; FALLBACK0-NEXT:    movq -128(%rsp,%rdi), %r10
+; FALLBACK0-NEXT:    movq -120(%rsp,%rdi), %r8
+; FALLBACK0-NEXT:    movq %r8, %r11
+; FALLBACK0-NEXT:    movl %eax, %ecx
+; FALLBACK0-NEXT:    shrq %cl, %r11
+; FALLBACK0-NEXT:    movl %eax, %esi
+; FALLBACK0-NEXT:    notb %sil
+; FALLBACK0-NEXT:    movq -112(%rsp,%rdi), %rbx
+; FALLBACK0-NEXT:    leaq (%rbx,%rbx), %r9
+; FALLBACK0-NEXT:    movl %esi, %ecx
+; FALLBACK0-NEXT:    shlq %cl, %r9
+; FALLBACK0-NEXT:    orq %r11, %r9
+; FALLBACK0-NEXT:    movl %eax, %ecx
+; FALLBACK0-NEXT:    shrq %cl, %r10
+; FALLBACK0-NEXT:    addq %r8, %r8
+; FALLBACK0-NEXT:    movl %esi, %ecx
+; FALLBACK0-NEXT:    shlq %cl, %r8
+; FALLBACK0-NEXT:    orq %r10, %r8
+; FALLBACK0-NEXT:    movq -104(%rsp,%rdi), %r10
+; FALLBACK0-NEXT:    movq %r10, %r15
+; FALLBACK0-NEXT:    movl %eax, %ecx
+; FALLBACK0-NEXT:    shrq %cl, %r15
+; FALLBACK0-NEXT:    movq -96(%rsp,%rdi), %r14
+; FALLBACK0-NEXT:    leaq (%r14,%r14), %r11
+; FALLBACK0-NEXT:    movl %esi, %ecx
+; FALLBACK0-NEXT:    shlq %cl, %r11
+; FALLBACK0-NEXT:    orq %r15, %r11
+; FALLBACK0-NEXT:    movl %eax, %ecx
+; FALLBACK0-NEXT:    shrq %cl, %rbx
+; FALLBACK0-NEXT:    addq %r10, %r10
+; FALLBACK0-NEXT:    movl %esi, %ecx
+; FALLBACK0-NEXT:    shlq %cl, %r10
+; FALLBACK0-NEXT:    orq %rbx, %r10
+; FALLBACK0-NEXT:    movq -88(%rsp,%rdi), %rbx
+; FALLBACK0-NEXT:    movq %rbx, %r12
+; FALLBACK0-NEXT:    movl %eax, %ecx
+; FALLBACK0-NEXT:    shrq %cl, %r12
+; FALLBACK0-NEXT:    movq -80(%rsp,%rdi), %r13
+; FALLBACK0-NEXT:    leaq (%r13,%r13), %r15
+; FALLBACK0-NEXT:    movl %esi, %ecx
+; FALLBACK0-NEXT:    shlq %cl, %r15
+; FALLBACK0-NEXT:    orq %r12, %r15
+; FALLBACK0-NEXT:    movl %eax, %ecx
+; FALLBACK0-NEXT:    shrq %cl, %r14
+; FALLBACK0-NEXT:    addq %rbx, %rbx
+; FALLBACK0-NEXT:    movl %esi, %ecx
+; FALLBACK0-NEXT:    shlq %cl, %rbx
+; FALLBACK0-NEXT:    orq %r14, %rbx
+; FALLBACK0-NEXT:    movl %eax, %ecx
+; FALLBACK0-NEXT:    shrq %cl, %r13
+; FALLBACK0-NEXT:    movq -72(%rsp,%rdi), %rdi
+; FALLBACK0-NEXT:    leaq (%rdi,%rdi), %r14
+; FALLBACK0-NEXT:    movl %esi, %ecx
+; FALLBACK0-NEXT:    shlq %cl, %r14
+; FALLBACK0-NEXT:    orq %r13, %r14
+; FALLBACK0-NEXT:    movl %eax, %ecx
+; FALLBACK0-NEXT:    shrq %cl, %rdi
+; FALLBACK0-NEXT:    movq %rdi, 56(%rdx)
+; FALLBACK0-NEXT:    movq %r14, 48(%rdx)
+; FALLBACK0-NEXT:    movq %rbx, 32(%rdx)
+; FALLBACK0-NEXT:    movq %r15, 40(%rdx)
+; FALLBACK0-NEXT:    movq %r10, 16(%rdx)
+; FALLBACK0-NEXT:    movq %r11, 24(%rdx)
+; FALLBACK0-NEXT:    movq %r8, (%rdx)
+; FALLBACK0-NEXT:    movq %r9, 8(%rdx)
+; FALLBACK0-NEXT:    popq %rbx
+; FALLBACK0-NEXT:    popq %r12
+; FALLBACK0-NEXT:    popq %r13
+; FALLBACK0-NEXT:    popq %r14
+; FALLBACK0-NEXT:    popq %r15
+; FALLBACK0-NEXT:    retq
+;
+; FALLBACK1-LABEL: lshr_64bytes:
+; FALLBACK1:       # %bb.0:
+; FALLBACK1-NEXT:    pushq %r15
+; FALLBACK1-NEXT:    pushq %r14
+; FALLBACK1-NEXT:    pushq %rbx
+; FALLBACK1-NEXT:    movq (%rdi), %rcx
+; FALLBACK1-NEXT:    movq 8(%rdi), %r8
+; FALLBACK1-NEXT:    movq 16(%rdi), %r9
+; FALLBACK1-NEXT:    movq 24(%rdi), %r10
+; FALLBACK1-NEXT:    movq 32(%rdi), %r11
+; FALLBACK1-NEXT:    movq 40(%rdi), %rbx
+; FALLBACK1-NEXT:    movq 48(%rdi), %r14
+; FALLBACK1-NEXT:    movq 56(%rdi), %rdi
+; FALLBACK1-NEXT:    movl (%rsi), %eax
+; FALLBACK1-NEXT:    xorps %xmm0, %xmm0
+; FALLBACK1-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
+; FALLBACK1-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
+; FALLBACK1-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
+; FALLBACK1-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
+; FALLBACK1-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK1-NEXT:    movq %r14, -{{[0-9]+}}(%rsp)
+; FALLBACK1-NEXT:    movq %rbx, -{{[0-9]+}}(%rsp)
+; FALLBACK1-NEXT:    movq %r11, -{{[0-9]+}}(%rsp)
+; FALLBACK1-NEXT:    movq %r10, -{{[0-9]+}}(%rsp)
+; FALLBACK1-NEXT:    movq %r9, -{{[0-9]+}}(%rsp)
+; FALLBACK1-NEXT:    movq %r8, -{{[0-9]+}}(%rsp)
+; FALLBACK1-NEXT:    movq %rcx, -{{[0-9]+}}(%rsp)
+; FALLBACK1-NEXT:    leal (,%rax,8), %ecx
+; FALLBACK1-NEXT:    andl $56, %ecx
+; FALLBACK1-NEXT:    andl $56, %eax
+; FALLBACK1-NEXT:    movq -112(%rsp,%rax), %rdi
+; FALLBACK1-NEXT:    movq -128(%rsp,%rax), %rsi
+; FALLBACK1-NEXT:    movq -120(%rsp,%rax), %r9
+; FALLBACK1-NEXT:    movq %r9, %r8
+; FALLBACK1-NEXT:    shrdq %cl, %rdi, %r8
+; FALLBACK1-NEXT:    movq -96(%rsp,%rax), %r10
+; FALLBACK1-NEXT:    movq -104(%rsp,%rax), %r11
+; FALLBACK1-NEXT:    movq %r11, %rbx
+; FALLBACK1-NEXT:    shrdq %cl, %r10, %rbx
+; FALLBACK1-NEXT:    shrdq %cl, %r11, %rdi
+; FALLBACK1-NEXT:    movq -80(%rsp,%rax), %r11
+; FALLBACK1-NEXT:    movq -88(%rsp,%rax), %r14
+; FALLBACK1-NEXT:    movq %r14, %r15
+; FALLBACK1-NEXT:    shrdq %cl, %r11, %r15
+; FALLBACK1-NEXT:    shrdq %cl, %r14, %r10
+; FALLBACK1-NEXT:    movq -72(%rsp,%rax), %rax
+; FALLBACK1-NEXT:    shrdq %cl, %rax, %r11
+; FALLBACK1-NEXT:    shrdq %cl, %r9, %rsi
+; FALLBACK1-NEXT:    # kill: def $cl killed $cl killed $ecx
+; FALLBACK1-NEXT:    shrq %cl, %rax
+; FALLBACK1-NEXT:    movq %r11, 48(%rdx)
+; FALLBACK1-NEXT:    movq %rax, 56(%rdx)
+; FALLBACK1-NEXT:    movq %r10, 32(%rdx)
+; FALLBACK1-NEXT:    movq %r15, 40(%rdx)
+; FALLBACK1-NEXT:    movq %rdi, 16(%rdx)
+; FALLBACK1-NEXT:    movq %rbx, 24(%rdx)
+; FALLBACK1-NEXT:    movq %rsi, (%rdx)
+; FALLBACK1-NEXT:    movq %r8, 8(%rdx)
+; FALLBACK1-NEXT:    popq %rbx
+; FALLBACK1-NEXT:    popq %r14
+; FALLBACK1-NEXT:    popq %r15
+; FALLBACK1-NEXT:    retq
+;
+; FALLBACK2-LABEL: lshr_64bytes:
+; FALLBACK2:       # %bb.0:
+; FALLBACK2-NEXT:    pushq %rbp
+; FALLBACK2-NEXT:    pushq %r15
+; FALLBACK2-NEXT:    pushq %r14
+; FALLBACK2-NEXT:    pushq %r13
+; FALLBACK2-NEXT:    pushq %r12
+; FALLBACK2-NEXT:    pushq %rbx
+; FALLBACK2-NEXT:    pushq %rax
+; FALLBACK2-NEXT:    movq (%rdi), %rcx
+; FALLBACK2-NEXT:    movq 8(%rdi), %r8
+; FALLBACK2-NEXT:    movq 16(%rdi), %r9
+; FALLBACK2-NEXT:    movq 24(%rdi), %r10
+; FALLBACK2-NEXT:    movq 32(%rdi), %r11
+; FALLBACK2-NEXT:    movq 40(%rdi), %rbx
+; FALLBACK2-NEXT:    movq 48(%rdi), %r14
+; FALLBACK2-NEXT:    movq 56(%rdi), %rdi
+; FALLBACK2-NEXT:    movl (%rsi), %eax
+; FALLBACK2-NEXT:    xorps %xmm0, %xmm0
+; FALLBACK2-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
+; FALLBACK2-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
+; FALLBACK2-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
+; FALLBACK2-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
+; FALLBACK2-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK2-NEXT:    movq %r14, -{{[0-9]+}}(%rsp)
+; FALLBACK2-NEXT:    movq %rbx, -{{[0-9]+}}(%rsp)
+; FALLBACK2-NEXT:    movq %r11, -{{[0-9]+}}(%rsp)
+; FALLBACK2-NEXT:    movq %r10, -{{[0-9]+}}(%rsp)
+; FALLBACK2-NEXT:    movq %r9, -{{[0-9]+}}(%rsp)
+; FALLBACK2-NEXT:    movq %r8, -{{[0-9]+}}(%rsp)
+; FALLBACK2-NEXT:    movq %rcx, -{{[0-9]+}}(%rsp)
+; FALLBACK2-NEXT:    leal (,%rax,8), %ecx
+; FALLBACK2-NEXT:    andl $56, %ecx
+; FALLBACK2-NEXT:    andl $56, %eax
+; FALLBACK2-NEXT:    movq -120(%rsp,%rax), %rdi
+; FALLBACK2-NEXT:    movq -112(%rsp,%rax), %r9
+; FALLBACK2-NEXT:    shrxq %rcx, %rdi, %rbx
+; FALLBACK2-NEXT:    shrxq %rcx, -128(%rsp,%rax), %r13
+; FALLBACK2-NEXT:    movq -104(%rsp,%rax), %rsi
+; FALLBACK2-NEXT:    shrxq %rcx, %rsi, %r8
+; FALLBACK2-NEXT:    movq -96(%rsp,%rax), %r10
+; FALLBACK2-NEXT:    shrxq %rcx, %r9, %r11
+; FALLBACK2-NEXT:    movq -88(%rsp,%rax), %r14
+; FALLBACK2-NEXT:    shrxq %rcx, %r14, %r15
+; FALLBACK2-NEXT:    shrxq %rcx, %r10, %rbp
+; FALLBACK2-NEXT:    movl %ecx, %r12d
+; FALLBACK2-NEXT:    notb %r12b
+; FALLBACK2-NEXT:    addq %r9, %r9
+; FALLBACK2-NEXT:    shlxq %r12, %r9, %r9
+; FALLBACK2-NEXT:    orq %rbx, %r9
+; FALLBACK2-NEXT:    addq %rdi, %rdi
+; FALLBACK2-NEXT:    shlxq %r12, %rdi, %rdi
+; FALLBACK2-NEXT:    orq %r13, %rdi
+; FALLBACK2-NEXT:    movq -80(%rsp,%rax), %rbx
+; FALLBACK2-NEXT:    shrxq %rcx, %rbx, %r13
+; FALLBACK2-NEXT:    movq -72(%rsp,%rax), %rax
+; FALLBACK2-NEXT:    shrxq %rcx, %rax, %rcx
+; FALLBACK2-NEXT:    addq %r10, %r10
+; FALLBACK2-NEXT:    shlxq %r12, %r10, %r10
+; FALLBACK2-NEXT:    orq %r8, %r10
+; FALLBACK2-NEXT:    addq %rsi, %rsi
+; FALLBACK2-NEXT:    shlxq %r12, %rsi, %rsi
+; FALLBACK2-NEXT:    orq %r11, %rsi
+; FALLBACK2-NEXT:    leaq (%rbx,%rbx), %r8
+; FALLBACK2-NEXT:    shlxq %r12, %r8, %r8
+; FALLBACK2-NEXT:    orq %r15, %r8
+; FALLBACK2-NEXT:    addq %r14, %r14
+; FALLBACK2-NEXT:    shlxq %r12, %r14, %r11
+; FALLBACK2-NEXT:    orq %rbp, %r11
+; FALLBACK2-NEXT:    addq %rax, %rax
+; FALLBACK2-NEXT:    shlxq %r12, %rax, %rax
+; FALLBACK2-NEXT:    orq %r13, %rax
+; FALLBACK2-NEXT:    movq %rcx, 56(%rdx)
+; FALLBACK2-NEXT:    movq %rax, 48(%rdx)
+; FALLBACK2-NEXT:    movq %r11, 32(%rdx)
+; FALLBACK2-NEXT:    movq %r8, 40(%rdx)
+; FALLBACK2-NEXT:    movq %rsi, 16(%rdx)
+; FALLBACK2-NEXT:    movq %r10, 24(%rdx)
+; FALLBACK2-NEXT:    movq %rdi, (%rdx)
+; FALLBACK2-NEXT:    movq %r9, 8(%rdx)
+; FALLBACK2-NEXT:    addq $8, %rsp
+; FALLBACK2-NEXT:    popq %rbx
+; FALLBACK2-NEXT:    popq %r12
+; FALLBACK2-NEXT:    popq %r13
+; FALLBACK2-NEXT:    popq %r14
+; FALLBACK2-NEXT:    popq %r15
+; FALLBACK2-NEXT:    popq %rbp
+; FALLBACK2-NEXT:    retq
+;
+; FALLBACK3-LABEL: lshr_64bytes:
+; FALLBACK3:       # %bb.0:
+; FALLBACK3-NEXT:    pushq %r15
+; FALLBACK3-NEXT:    pushq %r14
+; FALLBACK3-NEXT:    pushq %rbx
+; FALLBACK3-NEXT:    movq (%rdi), %rcx
+; FALLBACK3-NEXT:    movq 8(%rdi), %r8
+; FALLBACK3-NEXT:    movq 16(%rdi), %r9
+; FALLBACK3-NEXT:    movq 24(%rdi), %r10
+; FALLBACK3-NEXT:    movq 32(%rdi), %r11
+; FALLBACK3-NEXT:    movq 40(%rdi), %rbx
+; FALLBACK3-NEXT:    movq 48(%rdi), %r14
+; FALLBACK3-NEXT:    movq 56(%rdi), %rdi
+; FALLBACK3-NEXT:    movl (%rsi), %eax
+; FALLBACK3-NEXT:    xorps %xmm0, %xmm0
+; FALLBACK3-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
+; FALLBACK3-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
+; FALLBACK3-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
+; FALLBACK3-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
+; FALLBACK3-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK3-NEXT:    movq %r14, -{{[0-9]+}}(%rsp)
+; FALLBACK3-NEXT:    movq %rbx, -{{[0-9]+}}(%rsp)
+; FALLBACK3-NEXT:    movq %r11, -{{[0-9]+}}(%rsp)
+; FALLBACK3-NEXT:    movq %r10, -{{[0-9]+}}(%rsp)
+; FALLBACK3-NEXT:    movq %r9, -{{[0-9]+}}(%rsp)
+; FALLBACK3-NEXT:    movq %r8, -{{[0-9]+}}(%rsp)
+; FALLBACK3-NEXT:    movq %rcx, -{{[0-9]+}}(%rsp)
+; FALLBACK3-NEXT:    leal (,%rax,8), %ecx
+; FALLBACK3-NEXT:    andl $56, %ecx
+; FALLBACK3-NEXT:    andl $56, %eax
+; FALLBACK3-NEXT:    movq -112(%rsp,%rax), %rdi
+; FALLBACK3-NEXT:    movq -128(%rsp,%rax), %rsi
+; FALLBACK3-NEXT:    movq -120(%rsp,%rax), %r9
+; FALLBACK3-NEXT:    movq %r9, %r8
+; FALLBACK3-NEXT:    shrdq %cl, %rdi, %r8
+; FALLBACK3-NEXT:    movq -96(%rsp,%rax), %r10
+; FALLBACK3-NEXT:    movq -104(%rsp,%rax), %r11
+; FALLBACK3-NEXT:    movq %r11, %rbx
+; FALLBACK3-NEXT:    shrdq %cl, %r10, %rbx
+; FALLBACK3-NEXT:    shrdq %cl, %r11, %rdi
+; FALLBACK3-NEXT:    movq -80(%rsp,%rax), %r11
+; FALLBACK3-NEXT:    movq -88(%rsp,%rax), %r14
+; FALLBACK3-NEXT:    movq %r14, %r15
+; FALLBACK3-NEXT:    shrdq %cl, %r11, %r15
+; FALLBACK3-NEXT:    shrdq %cl, %r14, %r10
+; FALLBACK3-NEXT:    movq -72(%rsp,%rax), %rax
+; FALLBACK3-NEXT:    shrdq %cl, %rax, %r11
+; FALLBACK3-NEXT:    shrxq %rcx, %rax, %rax
+; FALLBACK3-NEXT:    # kill: def $cl killed $cl killed $rcx
+; FALLBACK3-NEXT:    shrdq %cl, %r9, %rsi
+; FALLBACK3-NEXT:    movq %r11, 48(%rdx)
+; FALLBACK3-NEXT:    movq %r10, 32(%rdx)
+; FALLBACK3-NEXT:    movq %r15, 40(%rdx)
+; FALLBACK3-NEXT:    movq %rdi, 16(%rdx)
+; FALLBACK3-NEXT:    movq %rbx, 24(%rdx)
+; FALLBACK3-NEXT:    movq %rsi, (%rdx)
+; FALLBACK3-NEXT:    movq %r8, 8(%rdx)
+; FALLBACK3-NEXT:    movq %rax, 56(%rdx)
+; FALLBACK3-NEXT:    popq %rbx
+; FALLBACK3-NEXT:    popq %r14
+; FALLBACK3-NEXT:    popq %r15
+; FALLBACK3-NEXT:    retq
+;
+; FALLBACK4-LABEL: lshr_64bytes:
+; FALLBACK4:       # %bb.0:
+; FALLBACK4-NEXT:    pushq %rbp
+; FALLBACK4-NEXT:    pushq %r15
+; FALLBACK4-NEXT:    pushq %r14
+; FALLBACK4-NEXT:    pushq %r13
+; FALLBACK4-NEXT:    pushq %r12
+; FALLBACK4-NEXT:    pushq %rbx
+; FALLBACK4-NEXT:    pushq %rax
+; FALLBACK4-NEXT:    movups (%rdi), %xmm0
+; FALLBACK4-NEXT:    movups 16(%rdi), %xmm1
+; FALLBACK4-NEXT:    movups 32(%rdi), %xmm2
+; FALLBACK4-NEXT:    movups 48(%rdi), %xmm3
+; FALLBACK4-NEXT:    movl (%rsi), %r8d
+; FALLBACK4-NEXT:    xorps %xmm4, %xmm4
+; FALLBACK4-NEXT:    movaps %xmm4, -{{[0-9]+}}(%rsp)
+; FALLBACK4-NEXT:    movaps %xmm4, -{{[0-9]+}}(%rsp)
+; FALLBACK4-NEXT:    movaps %xmm4, -{{[0-9]+}}(%rsp)
+; FALLBACK4-NEXT:    movaps %xmm4, -{{[0-9]+}}(%rsp)
+; FALLBACK4-NEXT:    movaps %xmm3, -{{[0-9]+}}(%rsp)
+; FALLBACK4-NEXT:    movaps %xmm2, -{{[0-9]+}}(%rsp)
+; FALLBACK4-NEXT:    movaps %xmm1, -{{[0-9]+}}(%rsp)
+; FALLBACK4-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
+; FALLBACK4-NEXT:    leal (,%r8,8), %eax
+; FALLBACK4-NEXT:    andl $56, %eax
+; FALLBACK4-NEXT:    andl $56, %r8d
+; FALLBACK4-NEXT:    movq -128(%rsp,%r8), %r10
+; FALLBACK4-NEXT:    movq -120(%rsp,%r8), %r9
+; FALLBACK4-NEXT:    movl %eax, %ecx
+; FALLBACK4-NEXT:    shrq %cl, %r10
+; FALLBACK4-NEXT:    movl %eax, %esi
+; FALLBACK4-NEXT:    notb %sil
+; FALLBACK4-NEXT:    leaq (%r9,%r9), %rdi
+; FALLBACK4-NEXT:    movl %esi, %ecx
+; FALLBACK4-NEXT:    shlq %cl, %rdi
+; FALLBACK4-NEXT:    orq %r10, %rdi
+; FALLBACK4-NEXT:    movq -104(%rsp,%r8), %r10
+; FALLBACK4-NEXT:    movq %r10, %rbx
+; FALLBACK4-NEXT:    movl %eax, %ecx
+; FALLBACK4-NEXT:    shrq %cl, %rbx
+; FALLBACK4-NEXT:    movq -96(%rsp,%r8), %r12
+; FALLBACK4-NEXT:    leaq (%r12,%r12), %r11
+; FALLBACK4-NEXT:    movl %esi, %ecx
+; FALLBACK4-NEXT:    shlq %cl, %r11
+; FALLBACK4-NEXT:    orq %rbx, %r11
+; FALLBACK4-NEXT:    movq -112(%rsp,%r8), %rbx
+; FALLBACK4-NEXT:    movq %rbx, %r14
+; FALLBACK4-NEXT:    movl %eax, %ecx
+; FALLBACK4-NEXT:    shrq %cl, %r14
+; FALLBACK4-NEXT:    addq %r10, %r10
+; FALLBACK4-NEXT:    movl %esi, %ecx
+; FALLBACK4-NEXT:    shlq %cl, %r10
+; FALLBACK4-NEXT:    orq %r14, %r10
+; FALLBACK4-NEXT:    movq -88(%rsp,%r8), %r14
+; FALLBACK4-NEXT:    movq %r14, %r13
+; FALLBACK4-NEXT:    movl %eax, %ecx
+; FALLBACK4-NEXT:    shrq %cl, %r13
+; FALLBACK4-NEXT:    movq -80(%rsp,%r8), %rbp
+; FALLBACK4-NEXT:    leaq (%rbp,%rbp), %r15
+; FALLBACK4-NEXT:    movl %esi, %ecx
+; FALLBACK4-NEXT:    shlq %cl, %r15
+; FALLBACK4-NEXT:    orq %r13, %r15
+; FALLBACK4-NEXT:    movl %eax, %ecx
+; FALLBACK4-NEXT:    shrq %cl, %r12
+; FALLBACK4-NEXT:    addq %r14, %r14
+; FALLBACK4-NEXT:    movl %esi, %ecx
+; FALLBACK4-NEXT:    shlq %cl, %r14
+; FALLBACK4-NEXT:    orq %r12, %r14
+; FALLBACK4-NEXT:    movl %eax, %ecx
+; FALLBACK4-NEXT:    shrq %cl, %rbp
+; FALLBACK4-NEXT:    movq -72(%rsp,%r8), %r8
+; FALLBACK4-NEXT:    leaq (%r8,%r8), %r12
+; FALLBACK4-NEXT:    movl %esi, %ecx
+; FALLBACK4-NEXT:    shlq %cl, %r12
+; FALLBACK4-NEXT:    orq %rbp, %r12
+; FALLBACK4-NEXT:    movl %eax, %ecx
+; FALLBACK4-NEXT:    shrq %cl, %r9
+; FALLBACK4-NEXT:    addq %rbx, %rbx
+; FALLBACK4-NEXT:    movl %esi, %ecx
+; FALLBACK4-NEXT:    shlq %cl, %rbx
+; FALLBACK4-NEXT:    orq %r9, %rbx
+; FALLBACK4-NEXT:    movl %eax, %ecx
+; FALLBACK4-NEXT:    shrq %cl, %r8
+; FALLBACK4-NEXT:    movq %r8, 56(%rdx)
+; FALLBACK4-NEXT:    movq %rbx, 8(%rdx)
+; FALLBACK4-NEXT:    movq %r12, 48(%rdx)
+; FALLBACK4-NEXT:    movq %r14, 32(%rdx)
+; FALLBACK4-NEXT:    movq %r15, 40(%rdx)
+; FALLBACK4-NEXT:    movq %r10, 16(%rdx)
+; FALLBACK4-NEXT:    movq %r11, 24(%rdx)
+; FALLBACK4-NEXT:    movq %rdi, (%rdx)
+; FALLBACK4-NEXT:    addq $8, %rsp
+; FALLBACK4-NEXT:    popq %rbx
+; FALLBACK4-NEXT:    popq %r12
+; FALLBACK4-NEXT:    popq %r13
+; FALLBACK4-NEXT:    popq %r14
+; FALLBACK4-NEXT:    popq %r15
+; FALLBACK4-NEXT:    popq %rbp
+; FALLBACK4-NEXT:    retq
+;
+; FALLBACK5-LABEL: lshr_64bytes:
+; FALLBACK5:       # %bb.0:
+; FALLBACK5-NEXT:    pushq %r15
+; FALLBACK5-NEXT:    pushq %r14
+; FALLBACK5-NEXT:    pushq %rbx
+; FALLBACK5-NEXT:    movups (%rdi), %xmm0
+; FALLBACK5-NEXT:    movups 16(%rdi), %xmm1
+; FALLBACK5-NEXT:    movups 32(%rdi), %xmm2
+; FALLBACK5-NEXT:    movups 48(%rdi), %xmm3
+; FALLBACK5-NEXT:    movl (%rsi), %eax
+; FALLBACK5-NEXT:    xorps %xmm4, %xmm4
+; FALLBACK5-NEXT:    movaps %xmm4, -{{[0-9]+}}(%rsp)
+; FALLBACK5-NEXT:    movaps %xmm4, -{{[0-9]+}}(%rsp)
+; FALLBACK5-NEXT:    movaps %xmm4, -{{[0-9]+}}(%rsp)
+; FALLBACK5-NEXT:    movaps %xmm4, -{{[0-9]+}}(%rsp)
+; FALLBACK5-NEXT:    movaps %xmm3, -{{[0-9]+}}(%rsp)
+; FALLBACK5-NEXT:    movaps %xmm2, -{{[0-9]+}}(%rsp)
+; FALLBACK5-NEXT:    movaps %xmm1, -{{[0-9]+}}(%rsp)
+; FALLBACK5-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
+; FALLBACK5-NEXT:    leal (,%rax,8), %ecx
+; FALLBACK5-NEXT:    andl $56, %ecx
+; FALLBACK5-NEXT:    andl $56, %eax
+; FALLBACK5-NEXT:    movq -96(%rsp,%rax), %rdi
+; FALLBACK5-NEXT:    movq -104(%rsp,%rax), %r9
+; FALLBACK5-NEXT:    movq %r9, %rsi
+; FALLBACK5-NEXT:    shrdq %cl, %rdi, %rsi
+; FALLBACK5-NEXT:    movq -112(%rsp,%rax), %r10
+; FALLBACK5-NEXT:    movq %r10, %r8
+; FALLBACK5-NEXT:    shrdq %cl, %r9, %r8
+; FALLBACK5-NEXT:    movq -80(%rsp,%rax), %r9
+; FALLBACK5-NEXT:    movq -88(%rsp,%rax), %r11
+; FALLBACK5-NEXT:    movq %r11, %rbx
+; FALLBACK5-NEXT:    shrdq %cl, %r9, %rbx
+; FALLBACK5-NEXT:    shrdq %cl, %r11, %rdi
+; FALLBACK5-NEXT:    movq -72(%rsp,%rax), %r11
+; FALLBACK5-NEXT:    shrdq %cl, %r11, %r9
+; FALLBACK5-NEXT:    movq -128(%rsp,%rax), %r14
+; FALLBACK5-NEXT:    movq -120(%rsp,%rax), %rax
+; FALLBACK5-NEXT:    movq %rax, %r15
+; FALLBACK5-NEXT:    shrdq %cl, %r10, %r15
+; FALLBACK5-NEXT:    shrdq %cl, %rax, %r14
+; FALLBACK5-NEXT:    # kill: def $cl killed $cl killed $ecx
+; FALLBACK5-NEXT:    shrq %cl, %r11
+; FALLBACK5-NEXT:    movq %r15, 8(%rdx)
+; FALLBACK5-NEXT:    movq %r9, 48(%rdx)
+; FALLBACK5-NEXT:    movq %r11, 56(%rdx)
+; FALLBACK5-NEXT:    movq %rdi, 32(%rdx)
+; FALLBACK5-NEXT:    movq %rbx, 40(%rdx)
+; FALLBACK5-NEXT:    movq %r8, 16(%rdx)
+; FALLBACK5-NEXT:    movq %rsi, 24(%rdx)
+; FALLBACK5-NEXT:    movq %r14, (%rdx)
+; FALLBACK5-NEXT:    popq %rbx
+; FALLBACK5-NEXT:    popq %r14
+; FALLBACK5-NEXT:    popq %r15
+; FALLBACK5-NEXT:    retq
+;
+; FALLBACK6-LABEL: lshr_64bytes:
+; FALLBACK6:       # %bb.0:
+; FALLBACK6-NEXT:    pushq %rbp
+; FALLBACK6-NEXT:    pushq %r15
+; FALLBACK6-NEXT:    pushq %r14
+; FALLBACK6-NEXT:    pushq %r13
+; FALLBACK6-NEXT:    pushq %r12
+; FALLBACK6-NEXT:    pushq %rbx
+; FALLBACK6-NEXT:    pushq %rax
+; FALLBACK6-NEXT:    movups (%rdi), %xmm0
+; FALLBACK6-NEXT:    movups 16(%rdi), %xmm1
+; FALLBACK6-NEXT:    movups 32(%rdi), %xmm2
+; FALLBACK6-NEXT:    movups 48(%rdi), %xmm3
+; FALLBACK6-NEXT:    movl (%rsi), %eax
+; FALLBACK6-NEXT:    xorps %xmm4, %xmm4
+; FALLBACK6-NEXT:    movaps %xmm4, -{{[0-9]+}}(%rsp)
+; FALLBACK6-NEXT:    movaps %xmm4, -{{[0-9]+}}(%rsp)
+; FALLBACK6-NEXT:    movaps %xmm4, -{{[0-9]+}}(%rsp)
+; FALLBACK6-NEXT:    movaps %xmm4, -{{[0-9]+}}(%rsp)
+; FALLBACK6-NEXT:    movaps %xmm3, -{{[0-9]+}}(%rsp)
+; FALLBACK6-NEXT:    movaps %xmm2, -{{[0-9]+}}(%rsp)
+; FALLBACK6-NEXT:    movaps %xmm1, -{{[0-9]+}}(%rsp)
+; FALLBACK6-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
+; FALLBACK6-NEXT:    leal (,%rax,8), %esi
+; FALLBACK6-NEXT:    andl $56, %esi
+; FALLBACK6-NEXT:    andl $56, %eax
+; FALLBACK6-NEXT:    shrxq %rsi, -128(%rsp,%rax), %r11
+; FALLBACK6-NEXT:    movq -112(%rsp,%rax), %rcx
+; FALLBACK6-NEXT:    movq -104(%rsp,%rax), %rdi
+; FALLBACK6-NEXT:    shrxq %rsi, %rdi, %r12
+; FALLBACK6-NEXT:    movq -96(%rsp,%rax), %r13
+; FALLBACK6-NEXT:    shrxq %rsi, %rcx, %r9
+; FALLBACK6-NEXT:    movq -88(%rsp,%rax), %r10
+; FALLBACK6-NEXT:    shrxq %rsi, %r10, %r14
+; FALLBACK6-NEXT:    shrxq %rsi, %r13, %r15
+; FALLBACK6-NEXT:    movl %esi, %ebx
+; FALLBACK6-NEXT:    notb %bl
+; FALLBACK6-NEXT:    movq -120(%rsp,%rax), %rbp
+; FALLBACK6-NEXT:    leaq (%rbp,%rbp), %r8
+; FALLBACK6-NEXT:    shlxq %rbx, %r8, %r8
+; FALLBACK6-NEXT:    orq %r11, %r8
+; FALLBACK6-NEXT:    leaq (%r13,%r13), %r11
+; FALLBACK6-NEXT:    shlxq %rbx, %r11, %r11
+; FALLBACK6-NEXT:    orq %r12, %r11
+; FALLBACK6-NEXT:    movq -80(%rsp,%rax), %r12
+; FALLBACK6-NEXT:    shrxq %rsi, %r12, %r13
+; FALLBACK6-NEXT:    shrxq %rsi, %rbp, %rbp
+; FALLBACK6-NEXT:    movq -72(%rsp,%rax), %rax
+; FALLBACK6-NEXT:    shrxq %rsi, %rax, %rsi
+; FALLBACK6-NEXT:    addq %rdi, %rdi
+; FALLBACK6-NEXT:    shlxq %rbx, %rdi, %rdi
+; FALLBACK6-NEXT:    orq %r9, %rdi
+; FALLBACK6-NEXT:    leaq (%r12,%r12), %r9
+; FALLBACK6-NEXT:    shlxq %rbx, %r9, %r9
+; FALLBACK6-NEXT:    orq %r14, %r9
+; FALLBACK6-NEXT:    addq %r10, %r10
+; FALLBACK6-NEXT:    shlxq %rbx, %r10, %r10
+; FALLBACK6-NEXT:    orq %r15, %r10
+; FALLBACK6-NEXT:    addq %rax, %rax
+; FALLBACK6-NEXT:    shlxq %rbx, %rax, %rax
+; FALLBACK6-NEXT:    orq %r13, %rax
+; FALLBACK6-NEXT:    addq %rcx, %rcx
+; FALLBACK6-NEXT:    shlxq %rbx, %rcx, %rcx
+; FALLBACK6-NEXT:    orq %rbp, %rcx
+; FALLBACK6-NEXT:    movq %rsi, 56(%rdx)
+; FALLBACK6-NEXT:    movq %rcx, 8(%rdx)
+; FALLBACK6-NEXT:    movq %rax, 48(%rdx)
+; FALLBACK6-NEXT:    movq %r10, 32(%rdx)
+; FALLBACK6-NEXT:    movq %r9, 40(%rdx)
+; FALLBACK6-NEXT:    movq %rdi, 16(%rdx)
+; FALLBACK6-NEXT:    movq %r11, 24(%rdx)
+; FALLBACK6-NEXT:    movq %r8, (%rdx)
+; FALLBACK6-NEXT:    addq $8, %rsp
+; FALLBACK6-NEXT:    popq %rbx
+; FALLBACK6-NEXT:    popq %r12
+; FALLBACK6-NEXT:    popq %r13
+; FALLBACK6-NEXT:    popq %r14
+; FALLBACK6-NEXT:    popq %r15
+; FALLBACK6-NEXT:    popq %rbp
+; FALLBACK6-NEXT:    retq
+;
+; FALLBACK7-LABEL: lshr_64bytes:
+; FALLBACK7:       # %bb.0:
+; FALLBACK7-NEXT:    pushq %r15
+; FALLBACK7-NEXT:    pushq %r14
+; FALLBACK7-NEXT:    pushq %rbx
+; FALLBACK7-NEXT:    movups (%rdi), %xmm0
+; FALLBACK7-NEXT:    movups 16(%rdi), %xmm1
+; FALLBACK7-NEXT:    movups 32(%rdi), %xmm2
+; FALLBACK7-NEXT:    movups 48(%rdi), %xmm3
+; FALLBACK7-NEXT:    movl (%rsi), %eax
+; FALLBACK7-NEXT:    xorps %xmm4, %xmm4
+; FALLBACK7-NEXT:    movaps %xmm4, -{{[0-9]+}}(%rsp)
+; FALLBACK7-NEXT:    movaps %xmm4, -{{[0-9]+}}(%rsp)
+; FALLBACK7-NEXT:    movaps %xmm4, -{{[0-9]+}}(%rsp)
+; FALLBACK7-NEXT:    movaps %xmm4, -{{[0-9]+}}(%rsp)
+; FALLBACK7-NEXT:    movaps %xmm3, -{{[0-9]+}}(%rsp)
+; FALLBACK7-NEXT:    movaps %xmm2, -{{[0-9]+}}(%rsp)
+; FALLBACK7-NEXT:    movaps %xmm1, -{{[0-9]+}}(%rsp)
+; FALLBACK7-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
+; FALLBACK7-NEXT:    leal (,%rax,8), %ecx
+; FALLBACK7-NEXT:    andl $56, %ecx
+; FALLBACK7-NEXT:    andl $56, %eax
+; FALLBACK7-NEXT:    movq -96(%rsp,%rax), %rdi
+; FALLBACK7-NEXT:    movq -104(%rsp,%rax), %r9
+; FALLBACK7-NEXT:    movq %r9, %rsi
+; FALLBACK7-NEXT:    shrdq %cl, %rdi, %rsi
+; FALLBACK7-NEXT:    movq -112(%rsp,%rax), %r10
+; FALLBACK7-NEXT:    movq %r10, %r8
+; FALLBACK7-NEXT:    shrdq %cl, %r9, %r8
+; FALLBACK7-NEXT:    movq -80(%rsp,%rax), %r9
+; FALLBACK7-NEXT:    movq -88(%rsp,%rax), %r11
+; FALLBACK7-NEXT:    movq %r11, %rbx
+; FALLBACK7-NEXT:    shrdq %cl, %r9, %rbx
+; FALLBACK7-NEXT:    shrdq %cl, %r11, %rdi
+; FALLBACK7-NEXT:    movq -72(%rsp,%rax), %r11
+; FALLBACK7-NEXT:    shrdq %cl, %r11, %r9
+; FALLBACK7-NEXT:    movq -128(%rsp,%rax), %r14
+; FALLBACK7-NEXT:    movq -120(%rsp,%rax), %rax
+; FALLBACK7-NEXT:    movq %rax, %r15
+; FALLBACK7-NEXT:    shrdq %cl, %r10, %r15
+; FALLBACK7-NEXT:    shrxq %rcx, %r11, %r10
+; FALLBACK7-NEXT:    # kill: def $cl killed $cl killed $rcx
+; FALLBACK7-NEXT:    shrdq %cl, %rax, %r14
+; FALLBACK7-NEXT:    movq %r15, 8(%rdx)
+; FALLBACK7-NEXT:    movq %r9, 48(%rdx)
+; FALLBACK7-NEXT:    movq %rdi, 32(%rdx)
+; FALLBACK7-NEXT:    movq %rbx, 40(%rdx)
+; FALLBACK7-NEXT:    movq %r8, 16(%rdx)
+; FALLBACK7-NEXT:    movq %rsi, 24(%rdx)
+; FALLBACK7-NEXT:    movq %r14, (%rdx)
+; FALLBACK7-NEXT:    movq %r10, 56(%rdx)
+; FALLBACK7-NEXT:    popq %rbx
+; FALLBACK7-NEXT:    popq %r14
+; FALLBACK7-NEXT:    popq %r15
+; FALLBACK7-NEXT:    retq
+;
+; FALLBACK8-LABEL: lshr_64bytes:
+; FALLBACK8:       # %bb.0:
+; FALLBACK8-NEXT:    pushq %rbp
+; FALLBACK8-NEXT:    pushq %r15
+; FALLBACK8-NEXT:    pushq %r14
+; FALLBACK8-NEXT:    pushq %r13
+; FALLBACK8-NEXT:    pushq %r12
+; FALLBACK8-NEXT:    pushq %rbx
+; FALLBACK8-NEXT:    pushq %rax
+; FALLBACK8-NEXT:    vmovups (%rdi), %ymm0
+; FALLBACK8-NEXT:    vmovups 32(%rdi), %ymm1
+; FALLBACK8-NEXT:    movl (%rsi), %r9d
+; FALLBACK8-NEXT:    vxorps %xmm2, %xmm2, %xmm2
+; FALLBACK8-NEXT:    vmovups %ymm2, -{{[0-9]+}}(%rsp)
+; FALLBACK8-NEXT:    vmovups %ymm2, -{{[0-9]+}}(%rsp)
+; FALLBACK8-NEXT:    vmovups %ymm1, -{{[0-9]+}}(%rsp)
+; FALLBACK8-NEXT:    vmovups %ymm0, -{{[0-9]+}}(%rsp)
+; FALLBACK8-NEXT:    leal (,%r9,8), %eax
+; FALLBACK8-NEXT:    andl $56, %eax
+; FALLBACK8-NEXT:    andl $56, %r9d
+; FALLBACK8-NEXT:    movq -128(%rsp,%r9), %r10
+; FALLBACK8-NEXT:    movq -120(%rsp,%r9), %r8
+; FALLBACK8-NEXT:    movl %eax, %ecx
+; FALLBACK8-NEXT:    shrq %cl, %r10
+; FALLBACK8-NEXT:    movl %eax, %esi
+; FALLBACK8-NEXT:    notb %sil
+; FALLBACK8-NEXT:    leaq (%r8,%r8), %rdi
+; FALLBACK8-NEXT:    movl %esi, %ecx
+; FALLBACK8-NEXT:    shlq %cl, %rdi
+; FALLBACK8-NEXT:    orq %r10, %rdi
+; FALLBACK8-NEXT:    movq -104(%rsp,%r9), %r10
+; FALLBACK8-NEXT:    movq %r10, %rbx
+; FALLBACK8-NEXT:    movl %eax, %ecx
+; FALLBACK8-NEXT:    shrq %cl, %rbx
+; FALLBACK8-NEXT:    movq -96(%rsp,%r9), %r12
+; FALLBACK8-NEXT:    leaq (%r12,%r12), %r11
+; FALLBACK8-NEXT:    movl %esi, %ecx
+; FALLBACK8-NEXT:    shlq %cl, %r11
+; FALLBACK8-NEXT:    orq %rbx, %r11
+; FALLBACK8-NEXT:    movq -112(%rsp,%r9), %rbx
+; FALLBACK8-NEXT:    movq %rbx, %r14
+; FALLBACK8-NEXT:    movl %eax, %ecx
+; FALLBACK8-NEXT:    shrq %cl, %r14
+; FALLBACK8-NEXT:    addq %r10, %r10
+; FALLBACK8-NEXT:    movl %esi, %ecx
+; FALLBACK8-NEXT:    shlq %cl, %r10
+; FALLBACK8-NEXT:    orq %r14, %r10
+; FALLBACK8-NEXT:    movq -88(%rsp,%r9), %r14
+; FALLBACK8-NEXT:    movq %r14, %r13
+; FALLBACK8-NEXT:    movl %eax, %ecx
+; FALLBACK8-NEXT:    shrq %cl, %r13
+; FALLBACK8-NEXT:    movq -80(%rsp,%r9), %rbp
+; FALLBACK8-NEXT:    leaq (%rbp,%rbp), %r15
+; FALLBACK8-NEXT:    movl %esi, %ecx
+; FALLBACK8-NEXT:    shlq %cl, %r15
+; FALLBACK8-NEXT:    orq %r13, %r15
+; FALLBACK8-NEXT:    movl %eax, %ecx
+; FALLBACK8-NEXT:    shrq %cl, %r12
+; FALLBACK8-NEXT:    addq %r14, %r14
+; FALLBACK8-NEXT:    movl %esi, %ecx
+; FALLBACK8-NEXT:    shlq %cl, %r14
+; FALLBACK8-NEXT:    orq %r12, %r14
+; FALLBACK8-NEXT:    movl %eax, %ecx
+; FALLBACK8-NEXT:    shrq %cl, %rbp
+; FALLBACK8-NEXT:    movq -72(%rsp,%r9), %r9
+; FALLBACK8-NEXT:    leaq (%r9,%r9), %r12
+; FALLBACK8-NEXT:    movl %esi, %ecx
+; FALLBACK8-NEXT:    shlq %cl, %r12
+; FALLBACK8-NEXT:    orq %rbp, %r12
+; FALLBACK8-NEXT:    movl %eax, %ecx
+; FALLBACK8-NEXT:    shrq %cl, %r8
+; FALLBACK8-NEXT:    addq %rbx, %rbx
+; FALLBACK8-NEXT:    movl %esi, %ecx
+; FALLBACK8-NEXT:    shlq %cl, %rbx
+; FALLBACK8-NEXT:    orq %r8, %rbx
+; FALLBACK8-NEXT:    movl %eax, %ecx
+; FALLBACK8-NEXT:    shrq %cl, %r9
+; FALLBACK8-NEXT:    movq %r9, 56(%rdx)
+; FALLBACK8-NEXT:    movq %rbx, 8(%rdx)
+; FALLBACK8-NEXT:    movq %r12, 48(%rdx)
+; FALLBACK8-NEXT:    movq %r14, 32(%rdx)
+; FALLBACK8-NEXT:    movq %r15, 40(%rdx)
+; FALLBACK8-NEXT:    movq %r10, 16(%rdx)
+; FALLBACK8-NEXT:    movq %r11, 24(%rdx)
+; FALLBACK8-NEXT:    movq %rdi, (%rdx)
+; FALLBACK8-NEXT:    addq $8, %rsp
+; FALLBACK8-NEXT:    popq %rbx
+; FALLBACK8-NEXT:    popq %r12
+; FALLBACK8-NEXT:    popq %r13
+; FALLBACK8-NEXT:    popq %r14
+; FALLBACK8-NEXT:    popq %r15
+; FALLBACK8-NEXT:    popq %rbp
+; FALLBACK8-NEXT:    vzeroupper
+; FALLBACK8-NEXT:    retq
+;
+; FALLBACK9-LABEL: lshr_64bytes:
+; FALLBACK9:       # %bb.0:
+; FALLBACK9-NEXT:    pushq %r15
+; FALLBACK9-NEXT:    pushq %r14
+; FALLBACK9-NEXT:    pushq %rbx
+; FALLBACK9-NEXT:    vmovups (%rdi), %ymm0
+; FALLBACK9-NEXT:    vmovups 32(%rdi), %ymm1
+; FALLBACK9-NEXT:    movl (%rsi), %eax
+; FALLBACK9-NEXT:    vxorps %xmm2, %xmm2, %xmm2
+; FALLBACK9-NEXT:    vmovups %ymm2, -{{[0-9]+}}(%rsp)
+; FALLBACK9-NEXT:    vmovups %ymm2, -{{[0-9]+}}(%rsp)
+; FALLBACK9-NEXT:    vmovups %ymm1, -{{[0-9]+}}(%rsp)
+; FALLBACK9-NEXT:    vmovups %ymm0, -{{[0-9]+}}(%rsp)
+; FALLBACK9-NEXT:    leal (,%rax,8), %ecx
+; FALLBACK9-NEXT:    andl $56, %ecx
+; FALLBACK9-NEXT:    andl $56, %eax
+; FALLBACK9-NEXT:    movq -96(%rsp,%rax), %rdi
+; FALLBACK9-NEXT:    movq -104(%rsp,%rax), %r9
+; FALLBACK9-NEXT:    movq %r9, %rsi
+; FALLBACK9-NEXT:    shrdq %cl, %rdi, %rsi
+; FALLBACK9-NEXT:    movq -112(%rsp,%rax), %r10
+; FALLBACK9-NEXT:    movq %r10, %r8
+; FALLBACK9-NEXT:    shrdq %cl, %r9, %r8
+; FALLBACK9-NEXT:    movq -80(%rsp,%rax), %r9
+; FALLBACK9-NEXT:    movq -88(%rsp,%rax), %r11
+; FALLBACK9-NEXT:    movq %r11, %rbx
+; FALLBACK9-NEXT:    shrdq %cl, %r9, %rbx
+; FALLBACK9-NEXT:    shrdq %cl, %r11, %rdi
+; FALLBACK9-NEXT:    movq -72(%rsp,%rax), %r11
+; FALLBACK9-NEXT:    shrdq %cl, %r11, %r9
+; FALLBACK9-NEXT:    movq -128(%rsp,%rax), %r14
+; FALLBACK9-NEXT:    movq -120(%rsp,%rax), %rax
+; FALLBACK9-NEXT:    movq %rax, %r15
+; FALLBACK9-NEXT:    shrdq %cl, %r10, %r15
+; FALLBACK9-NEXT:    shrdq %cl, %rax, %r14
+; FALLBACK9-NEXT:    # kill: def $cl killed $cl killed $ecx
+; FALLBACK9-NEXT:    shrq %cl, %r11
+; FALLBACK9-NEXT:    movq %r15, 8(%rdx)
+; FALLBACK9-NEXT:    movq %r9, 48(%rdx)
+; FALLBACK9-NEXT:    movq %r11, 56(%rdx)
+; FALLBACK9-NEXT:    movq %rdi, 32(%rdx)
+; FALLBACK9-NEXT:    movq %rbx, 40(%rdx)
+; FALLBACK9-NEXT:    movq %r8, 16(%rdx)
+; FALLBACK9-NEXT:    movq %rsi, 24(%rdx)
+; FALLBACK9-NEXT:    movq %r14, (%rdx)
+; FALLBACK9-NEXT:    popq %rbx
+; FALLBACK9-NEXT:    popq %r14
+; FALLBACK9-NEXT:    popq %r15
+; FALLBACK9-NEXT:    vzeroupper
+; FALLBACK9-NEXT:    retq
+;
+; FALLBACK10-LABEL: lshr_64bytes:
+; FALLBACK10:       # %bb.0:
+; FALLBACK10-NEXT:    pushq %rbp
+; FALLBACK10-NEXT:    pushq %r15
+; FALLBACK10-NEXT:    pushq %r14
+; FALLBACK10-NEXT:    pushq %r13
+; FALLBACK10-NEXT:    pushq %r12
+; FALLBACK10-NEXT:    pushq %rbx
+; FALLBACK10-NEXT:    pushq %rax
+; FALLBACK10-NEXT:    vmovups (%rdi), %ymm0
+; FALLBACK10-NEXT:    vmovups 32(%rdi), %ymm1
+; FALLBACK10-NEXT:    movl (%rsi), %eax
+; FALLBACK10-NEXT:    vxorps %xmm2, %xmm2, %xmm2
+; FALLBACK10-NEXT:    vmovups %ymm2, -{{[0-9]+}}(%rsp)
+; FALLBACK10-NEXT:    vmovups %ymm2, -{{[0-9]+}}(%rsp)
+; FALLBACK10-NEXT:    vmovups %ymm1, -{{[0-9]+}}(%rsp)
+; FALLBACK10-NEXT:    vmovups %ymm0, -{{[0-9]+}}(%rsp)
+; FALLBACK10-NEXT:    leal (,%rax,8), %esi
+; FALLBACK10-NEXT:    andl $56, %esi
+; FALLBACK10-NEXT:    andl $56, %eax
+; FALLBACK10-NEXT:    shrxq %rsi, -128(%rsp,%rax), %r11
+; FALLBACK10-NEXT:    movq -112(%rsp,%rax), %rcx
+; FALLBACK10-NEXT:    movq -104(%rsp,%rax), %rdi
+; FALLBACK10-NEXT:    shrxq %rsi, %rdi, %r12
+; FALLBACK10-NEXT:    movq -96(%rsp,%rax), %r13
+; FALLBACK10-NEXT:    shrxq %rsi, %rcx, %r9
+; FALLBACK10-NEXT:    movq -88(%rsp,%rax), %r10
+; FALLBACK10-NEXT:    shrxq %rsi, %r10, %r14
+; FALLBACK10-NEXT:    shrxq %rsi, %r13, %r15
+; FALLBACK10-NEXT:    movl %esi, %ebx
+; FALLBACK10-NEXT:    notb %bl
+; FALLBACK10-NEXT:    movq -120(%rsp,%rax), %rbp
+; FALLBACK10-NEXT:    leaq (%rbp,%rbp), %r8
+; FALLBACK10-NEXT:    shlxq %rbx, %r8, %r8
+; FALLBACK10-NEXT:    orq %r11, %r8
+; FALLBACK10-NEXT:    leaq (%r13,%r13), %r11
+; FALLBACK10-NEXT:    shlxq %rbx, %r11, %r11
+; FALLBACK10-NEXT:    orq %r12, %r11
+; FALLBACK10-NEXT:    movq -80(%rsp,%rax), %r12
+; FALLBACK10-NEXT:    shrxq %rsi, %r12, %r13
+; FALLBACK10-NEXT:    shrxq %rsi, %rbp, %rbp
+; FALLBACK10-NEXT:    movq -72(%rsp,%rax), %rax
+; FALLBACK10-NEXT:    shrxq %rsi, %rax, %rsi
+; FALLBACK10-NEXT:    addq %rdi, %rdi
+; FALLBACK10-NEXT:    shlxq %rbx, %rdi, %rdi
+; FALLBACK10-NEXT:    orq %r9, %rdi
+; FALLBACK10-NEXT:    leaq (%r12,%r12), %r9
+; FALLBACK10-NEXT:    shlxq %rbx, %r9, %r9
+; FALLBACK10-NEXT:    orq %r14, %r9
+; FALLBACK10-NEXT:    addq %r10, %r10
+; FALLBACK10-NEXT:    shlxq %rbx, %r10, %r10
+; FALLBACK10-NEXT:    orq %r15, %r10
+; FALLBACK10-NEXT:    addq %rax, %rax
+; FALLBACK10-NEXT:    shlxq %rbx, %rax, %rax
+; FALLBACK10-NEXT:    orq %r13, %rax
+; FALLBACK10-NEXT:    addq %rcx, %rcx
+; FALLBACK10-NEXT:    shlxq %rbx, %rcx, %rcx
+; FALLBACK10-NEXT:    orq %rbp, %rcx
+; FALLBACK10-NEXT:    movq %rsi, 56(%rdx)
+; FALLBACK10-NEXT:    movq %rcx, 8(%rdx)
+; FALLBACK10-NEXT:    movq %rax, 48(%rdx)
+; FALLBACK10-NEXT:    movq %r10, 32(%rdx)
+; FALLBACK10-NEXT:    movq %r9, 40(%rdx)
+; FALLBACK10-NEXT:    movq %rdi, 16(%rdx)
+; FALLBACK10-NEXT:    movq %r11, 24(%rdx)
+; FALLBACK10-NEXT:    movq %r8, (%rdx)
+; FALLBACK10-NEXT:    addq $8, %rsp
+; FALLBACK10-NEXT:    popq %rbx
+; FALLBACK10-NEXT:    popq %r12
+; FALLBACK10-NEXT:    popq %r13
+; FALLBACK10-NEXT:    popq %r14
+; FALLBACK10-NEXT:    popq %r15
+; FALLBACK10-NEXT:    popq %rbp
+; FALLBACK10-NEXT:    vzeroupper
+; FALLBACK10-NEXT:    retq
+;
+; FALLBACK11-LABEL: lshr_64bytes:
+; FALLBACK11:       # %bb.0:
+; FALLBACK11-NEXT:    pushq %r15
+; FALLBACK11-NEXT:    pushq %r14
+; FALLBACK11-NEXT:    pushq %rbx
+; FALLBACK11-NEXT:    vmovups (%rdi), %ymm0
+; FALLBACK11-NEXT:    vmovups 32(%rdi), %ymm1
+; FALLBACK11-NEXT:    movl (%rsi), %eax
+; FALLBACK11-NEXT:    vxorps %xmm2, %xmm2, %xmm2
+; FALLBACK11-NEXT:    vmovups %ymm2, -{{[0-9]+}}(%rsp)
+; FALLBACK11-NEXT:    vmovups %ymm2, -{{[0-9]+}}(%rsp)
+; FALLBACK11-NEXT:    vmovups %ymm1, -{{[0-9]+}}(%rsp)
+; FALLBACK11-NEXT:    vmovups %ymm0, -{{[0-9]+}}(%rsp)
+; FALLBACK11-NEXT:    leal (,%rax,8), %ecx
+; FALLBACK11-NEXT:    andl $56, %ecx
+; FALLBACK11-NEXT:    andl $56, %eax
+; FALLBACK11-NEXT:    movq -96(%rsp,%rax), %rdi
+; FALLBACK11-NEXT:    movq -104(%rsp,%rax), %r9
+; FALLBACK11-NEXT:    movq %r9, %rsi
+; FALLBACK11-NEXT:    shrdq %cl, %rdi, %rsi
+; FALLBACK11-NEXT:    movq -112(%rsp,%rax), %r10
+; FALLBACK11-NEXT:    movq %r10, %r8
+; FALLBACK11-NEXT:    shrdq %cl, %r9, %r8
+; FALLBACK11-NEXT:    movq -80(%rsp,%rax), %r9
+; FALLBACK11-NEXT:    movq -88(%rsp,%rax), %r11
+; FALLBACK11-NEXT:    movq %r11, %rbx
+; FALLBACK11-NEXT:    shrdq %cl, %r9, %rbx
+; FALLBACK11-NEXT:    shrdq %cl, %r11, %rdi
+; FALLBACK11-NEXT:    movq -72(%rsp,%rax), %r11
+; FALLBACK11-NEXT:    shrdq %cl, %r11, %r9
+; FALLBACK11-NEXT:    movq -128(%rsp,%rax), %r14
+; FALLBACK11-NEXT:    movq -120(%rsp,%rax), %rax
+; FALLBACK11-NEXT:    movq %rax, %r15
+; FALLBACK11-NEXT:    shrdq %cl, %r10, %r15
+; FALLBACK11-NEXT:    shrxq %rcx, %r11, %r10
+; FALLBACK11-NEXT:    # kill: def $cl killed $cl killed $rcx
+; FALLBACK11-NEXT:    shrdq %cl, %rax, %r14
+; FALLBACK11-NEXT:    movq %r15, 8(%rdx)
+; FALLBACK11-NEXT:    movq %r9, 48(%rdx)
+; FALLBACK11-NEXT:    movq %rdi, 32(%rdx)
+; FALLBACK11-NEXT:    movq %rbx, 40(%rdx)
+; FALLBACK11-NEXT:    movq %r8, 16(%rdx)
+; FALLBACK11-NEXT:    movq %rsi, 24(%rdx)
+; FALLBACK11-NEXT:    movq %r14, (%rdx)
+; FALLBACK11-NEXT:    movq %r10, 56(%rdx)
+; FALLBACK11-NEXT:    popq %rbx
+; FALLBACK11-NEXT:    popq %r14
+; FALLBACK11-NEXT:    popq %r15
+; FALLBACK11-NEXT:    vzeroupper
+; FALLBACK11-NEXT:    retq
+;
+; FALLBACK12-LABEL: lshr_64bytes:
+; FALLBACK12:       # %bb.0:
+; FALLBACK12-NEXT:    pushq %rbp
+; FALLBACK12-NEXT:    pushq %r15
+; FALLBACK12-NEXT:    pushq %r14
+; FALLBACK12-NEXT:    pushq %r13
+; FALLBACK12-NEXT:    pushq %r12
+; FALLBACK12-NEXT:    pushq %rbx
+; FALLBACK12-NEXT:    pushq %rax
+; FALLBACK12-NEXT:    vmovups (%rdi), %zmm0
+; FALLBACK12-NEXT:    movl (%rsi), %r9d
+; FALLBACK12-NEXT:    vxorps %xmm1, %xmm1, %xmm1
+; FALLBACK12-NEXT:    vmovups %zmm1, -{{[0-9]+}}(%rsp)
+; FALLBACK12-NEXT:    vmovups %zmm0, -{{[0-9]+}}(%rsp)
+; FALLBACK12-NEXT:    leal (,%r9,8), %eax
+; FALLBACK12-NEXT:    andl $56, %eax
+; FALLBACK12-NEXT:    andl $56, %r9d
+; FALLBACK12-NEXT:    movq -128(%rsp,%r9), %r10
+; FALLBACK12-NEXT:    movq -120(%rsp,%r9), %r8
+; FALLBACK12-NEXT:    movl %eax, %ecx
+; FALLBACK12-NEXT:    shrq %cl, %r10
+; FALLBACK12-NEXT:    movl %eax, %esi
+; FALLBACK12-NEXT:    notb %sil
+; FALLBACK12-NEXT:    leaq (%r8,%r8), %rdi
+; FALLBACK12-NEXT:    movl %esi, %ecx
+; FALLBACK12-NEXT:    shlq %cl, %rdi
+; FALLBACK12-NEXT:    orq %r10, %rdi
+; FALLBACK12-NEXT:    movq -104(%rsp,%r9), %r10
+; FALLBACK12-NEXT:    movq %r10, %rbx
+; FALLBACK12-NEXT:    movl %eax, %ecx
+; FALLBACK12-NEXT:    shrq %cl, %rbx
+; FALLBACK12-NEXT:    movq -96(%rsp,%r9), %r12
+; FALLBACK12-NEXT:    leaq (%r12,%r12), %r11
+; FALLBACK12-NEXT:    movl %esi, %ecx
+; FALLBACK12-NEXT:    shlq %cl, %r11
+; FALLBACK12-NEXT:    orq %rbx, %r11
+; FALLBACK12-NEXT:    movq -112(%rsp,%r9), %rbx
+; FALLBACK12-NEXT:    movq %rbx, %r14
+; FALLBACK12-NEXT:    movl %eax, %ecx
+; FALLBACK12-NEXT:    shrq %cl, %r14
+; FALLBACK12-NEXT:    addq %r10, %r10
+; FALLBACK12-NEXT:    movl %esi, %ecx
+; FALLBACK12-NEXT:    shlq %cl, %r10
+; FALLBACK12-NEXT:    orq %r14, %r10
+; FALLBACK12-NEXT:    movq -88(%rsp,%r9), %r14
+; FALLBACK12-NEXT:    movq %r14, %r13
+; FALLBACK12-NEXT:    movl %eax, %ecx
+; FALLBACK12-NEXT:    shrq %cl, %r13
+; FALLBACK12-NEXT:    movq -80(%rsp,%r9), %rbp
+; FALLBACK12-NEXT:    leaq (%rbp,%rbp), %r15
+; FALLBACK12-NEXT:    movl %esi, %ecx
+; FALLBACK12-NEXT:    shlq %cl, %r15
+; FALLBACK12-NEXT:    orq %r13, %r15
+; FALLBACK12-NEXT:    movl %eax, %ecx
+; FALLBACK12-NEXT:    shrq %cl, %r12
+; FALLBACK12-NEXT:    addq %r14, %r14
+; FALLBACK12-NEXT:    movl %esi, %ecx
+; FALLBACK12-NEXT:    shlq %cl, %r14
+; FALLBACK12-NEXT:    orq %r12, %r14
+; FALLBACK12-NEXT:    movl %eax, %ecx
+; FALLBACK12-NEXT:    shrq %cl, %rbp
+; FALLBACK12-NEXT:    movq -72(%rsp,%r9), %r9
+; FALLBACK12-NEXT:    leaq (%r9,%r9), %r12
+; FALLBACK12-NEXT:    movl %esi, %ecx
+; FALLBACK12-NEXT:    shlq %cl, %r12
+; FALLBACK12-NEXT:    orq %rbp, %r12
+; FALLBACK12-NEXT:    movl %eax, %ecx
+; FALLBACK12-NEXT:    shrq %cl, %r8
+; FALLBACK12-NEXT:    addq %rbx, %rbx
+; FALLBACK12-NEXT:    movl %esi, %ecx
+; FALLBACK12-NEXT:    shlq %cl, %rbx
+; FALLBACK12-NEXT:    orq %r8, %rbx
+; FALLBACK12-NEXT:    movl %eax, %ecx
+; FALLBACK12-NEXT:    shrq %cl, %r9
+; FALLBACK12-NEXT:    movq %r9, 56(%rdx)
+; FALLBACK12-NEXT:    movq %rbx, 8(%rdx)
+; FALLBACK12-NEXT:    movq %r12, 48(%rdx)
+; FALLBACK12-NEXT:    movq %r14, 32(%rdx)
+; FALLBACK12-NEXT:    movq %r15, 40(%rdx)
+; FALLBACK12-NEXT:    movq %r10, 16(%rdx)
+; FALLBACK12-NEXT:    movq %r11, 24(%rdx)
+; FALLBACK12-NEXT:    movq %rdi, (%rdx)
+; FALLBACK12-NEXT:    addq $8, %rsp
+; FALLBACK12-NEXT:    popq %rbx
+; FALLBACK12-NEXT:    popq %r12
+; FALLBACK12-NEXT:    popq %r13
+; FALLBACK12-NEXT:    popq %r14
+; FALLBACK12-NEXT:    popq %r15
+; FALLBACK12-NEXT:    popq %rbp
+; FALLBACK12-NEXT:    vzeroupper
+; FALLBACK12-NEXT:    retq
+;
+; FALLBACK13-LABEL: lshr_64bytes:
+; FALLBACK13:       # %bb.0:
+; FALLBACK13-NEXT:    pushq %r15
+; FALLBACK13-NEXT:    pushq %r14
+; FALLBACK13-NEXT:    pushq %rbx
+; FALLBACK13-NEXT:    vmovups (%rdi), %zmm0
+; FALLBACK13-NEXT:    movl (%rsi), %edi
+; FALLBACK13-NEXT:    vxorps %xmm1, %xmm1, %xmm1
+; FALLBACK13-NEXT:    vmovups %zmm1, -{{[0-9]+}}(%rsp)
+; FALLBACK13-NEXT:    vmovups %zmm0, -{{[0-9]+}}(%rsp)
+; FALLBACK13-NEXT:    leal (,%rdi,8), %ecx
+; FALLBACK13-NEXT:    andl $56, %ecx
+; FALLBACK13-NEXT:    andl $56, %edi
+; FALLBACK13-NEXT:    movq -96(%rsp,%rdi), %rsi
+; FALLBACK13-NEXT:    movq -104(%rsp,%rdi), %r9
+; FALLBACK13-NEXT:    movq %r9, %rax
+; FALLBACK13-NEXT:    shrdq %cl, %rsi, %rax
+; FALLBACK13-NEXT:    movq -112(%rsp,%rdi), %r10
+; FALLBACK13-NEXT:    movq %r10, %r8
+; FALLBACK13-NEXT:    shrdq %cl, %r9, %r8
+; FALLBACK13-NEXT:    movq -80(%rsp,%rdi), %r9
+; FALLBACK13-NEXT:    movq -88(%rsp,%rdi), %r11
+; FALLBACK13-NEXT:    movq %r11, %rbx
+; FALLBACK13-NEXT:    shrdq %cl, %r9, %rbx
+; FALLBACK13-NEXT:    shrdq %cl, %r11, %rsi
+; FALLBACK13-NEXT:    movq -72(%rsp,%rdi), %r11
+; FALLBACK13-NEXT:    shrdq %cl, %r11, %r9
+; FALLBACK13-NEXT:    movq -128(%rsp,%rdi), %r14
+; FALLBACK13-NEXT:    movq -120(%rsp,%rdi), %rdi
+; FALLBACK13-NEXT:    movq %rdi, %r15
+; FALLBACK13-NEXT:    shrdq %cl, %r10, %r15
+; FALLBACK13-NEXT:    shrdq %cl, %rdi, %r14
+; FALLBACK13-NEXT:    # kill: def $cl killed $cl killed $ecx
+; FALLBACK13-NEXT:    shrq %cl, %r11
+; FALLBACK13-NEXT:    movq %r15, 8(%rdx)
+; FALLBACK13-NEXT:    movq %r9, 48(%rdx)
+; FALLBACK13-NEXT:    movq %r11, 56(%rdx)
+; FALLBACK13-NEXT:    movq %rsi, 32(%rdx)
+; FALLBACK13-NEXT:    movq %rbx, 40(%rdx)
+; FALLBACK13-NEXT:    movq %r8, 16(%rdx)
+; FALLBACK13-NEXT:    movq %rax, 24(%rdx)
+; FALLBACK13-NEXT:    movq %r14, (%rdx)
+; FALLBACK13-NEXT:    popq %rbx
+; FALLBACK13-NEXT:    popq %r14
+; FALLBACK13-NEXT:    popq %r15
+; FALLBACK13-NEXT:    vzeroupper
+; FALLBACK13-NEXT:    retq
+;
+; FALLBACK14-LABEL: lshr_64bytes:
+; FALLBACK14:       # %bb.0:
+; FALLBACK14-NEXT:    pushq %rbp
+; FALLBACK14-NEXT:    pushq %r15
+; FALLBACK14-NEXT:    pushq %r14
+; FALLBACK14-NEXT:    pushq %r13
+; FALLBACK14-NEXT:    pushq %r12
+; FALLBACK14-NEXT:    pushq %rbx
+; FALLBACK14-NEXT:    pushq %rax
+; FALLBACK14-NEXT:    vmovups (%rdi), %zmm0
+; FALLBACK14-NEXT:    movl (%rsi), %esi
+; FALLBACK14-NEXT:    vxorps %xmm1, %xmm1, %xmm1
+; FALLBACK14-NEXT:    vmovups %zmm1, -{{[0-9]+}}(%rsp)
+; FALLBACK14-NEXT:    vmovups %zmm0, -{{[0-9]+}}(%rsp)
+; FALLBACK14-NEXT:    leal (,%rsi,8), %ecx
+; FALLBACK14-NEXT:    andl $56, %ecx
+; FALLBACK14-NEXT:    andl $56, %esi
+; FALLBACK14-NEXT:    shrxq %rcx, -128(%rsp,%rsi), %r11
+; FALLBACK14-NEXT:    movq -112(%rsp,%rsi), %rax
+; FALLBACK14-NEXT:    movq -104(%rsp,%rsi), %rdi
+; FALLBACK14-NEXT:    shrxq %rcx, %rdi, %r12
+; FALLBACK14-NEXT:    movq -96(%rsp,%rsi), %r13
+; FALLBACK14-NEXT:    shrxq %rcx, %rax, %r9
+; FALLBACK14-NEXT:    movq -88(%rsp,%rsi), %r10
+; FALLBACK14-NEXT:    shrxq %rcx, %r10, %r14
+; FALLBACK14-NEXT:    shrxq %rcx, %r13, %r15
+; FALLBACK14-NEXT:    movl %ecx, %ebx
+; FALLBACK14-NEXT:    notb %bl
+; FALLBACK14-NEXT:    movq -120(%rsp,%rsi), %rbp
+; FALLBACK14-NEXT:    leaq (%rbp,%rbp), %r8
+; FALLBACK14-NEXT:    shlxq %rbx, %r8, %r8
+; FALLBACK14-NEXT:    orq %r11, %r8
+; FALLBACK14-NEXT:    leaq (%r13,%r13), %r11
+; FALLBACK14-NEXT:    shlxq %rbx, %r11, %r11
+; FALLBACK14-NEXT:    orq %r12, %r11
+; FALLBACK14-NEXT:    movq -80(%rsp,%rsi), %r12
+; FALLBACK14-NEXT:    shrxq %rcx, %r12, %r13
+; FALLBACK14-NEXT:    shrxq %rcx, %rbp, %rbp
+; FALLBACK14-NEXT:    movq -72(%rsp,%rsi), %rsi
+; FALLBACK14-NEXT:    shrxq %rcx, %rsi, %rcx
+; FALLBACK14-NEXT:    addq %rdi, %rdi
+; FALLBACK14-NEXT:    shlxq %rbx, %rdi, %rdi
+; FALLBACK14-NEXT:    orq %r9, %rdi
+; FALLBACK14-NEXT:    leaq (%r12,%r12), %r9
+; FALLBACK14-NEXT:    shlxq %rbx, %r9, %r9
+; FALLBACK14-NEXT:    orq %r14, %r9
+; FALLBACK14-NEXT:    addq %r10, %r10
+; FALLBACK14-NEXT:    shlxq %rbx, %r10, %r10
+; FALLBACK14-NEXT:    orq %r15, %r10
+; FALLBACK14-NEXT:    addq %rsi, %rsi
+; FALLBACK14-NEXT:    shlxq %rbx, %rsi, %rsi
+; FALLBACK14-NEXT:    orq %r13, %rsi
+; FALLBACK14-NEXT:    addq %rax, %rax
+; FALLBACK14-NEXT:    shlxq %rbx, %rax, %rax
+; FALLBACK14-NEXT:    orq %rbp, %rax
+; FALLBACK14-NEXT:    movq %rcx, 56(%rdx)
+; FALLBACK14-NEXT:    movq %rax, 8(%rdx)
+; FALLBACK14-NEXT:    movq %rsi, 48(%rdx)
+; FALLBACK14-NEXT:    movq %r10, 32(%rdx)
+; FALLBACK14-NEXT:    movq %r9, 40(%rdx)
+; FALLBACK14-NEXT:    movq %rdi, 16(%rdx)
+; FALLBACK14-NEXT:    movq %r11, 24(%rdx)
+; FALLBACK14-NEXT:    movq %r8, (%rdx)
+; FALLBACK14-NEXT:    addq $8, %rsp
+; FALLBACK14-NEXT:    popq %rbx
+; FALLBACK14-NEXT:    popq %r12
+; FALLBACK14-NEXT:    popq %r13
+; FALLBACK14-NEXT:    popq %r14
+; FALLBACK14-NEXT:    popq %r15
+; FALLBACK14-NEXT:    popq %rbp
+; FALLBACK14-NEXT:    vzeroupper
+; FALLBACK14-NEXT:    retq
+;
+; FALLBACK15-LABEL: lshr_64bytes:
+; FALLBACK15:       # %bb.0:
+; FALLBACK15-NEXT:    pushq %r15
+; FALLBACK15-NEXT:    pushq %r14
+; FALLBACK15-NEXT:    pushq %rbx
+; FALLBACK15-NEXT:    vmovups (%rdi), %zmm0
+; FALLBACK15-NEXT:    movl (%rsi), %eax
+; FALLBACK15-NEXT:    vxorps %xmm1, %xmm1, %xmm1
+; FALLBACK15-NEXT:    vmovups %zmm1, -{{[0-9]+}}(%rsp)
+; FALLBACK15-NEXT:    vmovups %zmm0, -{{[0-9]+}}(%rsp)
+; FALLBACK15-NEXT:    leal (,%rax,8), %ecx
+; FALLBACK15-NEXT:    andl $56, %ecx
+; FALLBACK15-NEXT:    andl $56, %eax
+; FALLBACK15-NEXT:    movq -96(%rsp,%rax), %rdi
+; FALLBACK15-NEXT:    movq -104(%rsp,%rax), %r9
+; FALLBACK15-NEXT:    movq %r9, %rsi
+; FALLBACK15-NEXT:    shrdq %cl, %rdi, %rsi
+; FALLBACK15-NEXT:    movq -112(%rsp,%rax), %r10
+; FALLBACK15-NEXT:    movq %r10, %r8
+; FALLBACK15-NEXT:    shrdq %cl, %r9, %r8
+; FALLBACK15-NEXT:    movq -80(%rsp,%rax), %r9
+; FALLBACK15-NEXT:    movq -88(%rsp,%rax), %r11
+; FALLBACK15-NEXT:    movq %r11, %rbx
+; FALLBACK15-NEXT:    shrdq %cl, %r9, %rbx
+; FALLBACK15-NEXT:    shrdq %cl, %r11, %rdi
+; FALLBACK15-NEXT:    movq -72(%rsp,%rax), %r11
+; FALLBACK15-NEXT:    shrdq %cl, %r11, %r9
+; FALLBACK15-NEXT:    movq -128(%rsp,%rax), %r14
+; FALLBACK15-NEXT:    movq -120(%rsp,%rax), %rax
+; FALLBACK15-NEXT:    movq %rax, %r15
+; FALLBACK15-NEXT:    shrdq %cl, %r10, %r15
+; FALLBACK15-NEXT:    shrxq %rcx, %r11, %r10
+; FALLBACK15-NEXT:    # kill: def $cl killed $cl killed $rcx
+; FALLBACK15-NEXT:    shrdq %cl, %rax, %r14
+; FALLBACK15-NEXT:    movq %r15, 8(%rdx)
+; FALLBACK15-NEXT:    movq %r9, 48(%rdx)
+; FALLBACK15-NEXT:    movq %rdi, 32(%rdx)
+; FALLBACK15-NEXT:    movq %rbx, 40(%rdx)
+; FALLBACK15-NEXT:    movq %r8, 16(%rdx)
+; FALLBACK15-NEXT:    movq %rsi, 24(%rdx)
+; FALLBACK15-NEXT:    movq %r14, (%rdx)
+; FALLBACK15-NEXT:    movq %r10, 56(%rdx)
+; FALLBACK15-NEXT:    popq %rbx
+; FALLBACK15-NEXT:    popq %r14
+; FALLBACK15-NEXT:    popq %r15
+; FALLBACK15-NEXT:    vzeroupper
+; FALLBACK15-NEXT:    retq
+;
+; FALLBACK16-LABEL: lshr_64bytes:
+; FALLBACK16:       # %bb.0:
+; FALLBACK16-NEXT:    pushl %ebp
+; FALLBACK16-NEXT:    pushl %ebx
+; FALLBACK16-NEXT:    pushl %edi
+; FALLBACK16-NEXT:    pushl %esi
+; FALLBACK16-NEXT:    subl $204, %esp
+; FALLBACK16-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; FALLBACK16-NEXT:    movl (%eax), %ecx
+; FALLBACK16-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK16-NEXT:    movl 4(%eax), %ecx
+; FALLBACK16-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK16-NEXT:    movl 8(%eax), %ecx
+; FALLBACK16-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK16-NEXT:    movl 12(%eax), %ecx
+; FALLBACK16-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK16-NEXT:    movl 16(%eax), %ecx
+; FALLBACK16-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK16-NEXT:    movl 20(%eax), %ecx
+; FALLBACK16-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK16-NEXT:    movl 24(%eax), %ecx
+; FALLBACK16-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK16-NEXT:    movl 28(%eax), %ecx
+; FALLBACK16-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK16-NEXT:    movl 32(%eax), %ecx
+; FALLBACK16-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK16-NEXT:    movl 36(%eax), %ecx
+; FALLBACK16-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK16-NEXT:    movl 40(%eax), %ebp
+; FALLBACK16-NEXT:    movl 44(%eax), %ebx
+; FALLBACK16-NEXT:    movl 48(%eax), %edi
+; FALLBACK16-NEXT:    movl 52(%eax), %esi
+; FALLBACK16-NEXT:    movl 56(%eax), %edx
+; FALLBACK16-NEXT:    movl 60(%eax), %ecx
+; FALLBACK16-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; FALLBACK16-NEXT:    movl (%eax), %eax
+; FALLBACK16-NEXT:    xorps %xmm0, %xmm0
+; FALLBACK16-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
+; FALLBACK16-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
+; FALLBACK16-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
+; FALLBACK16-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; FALLBACK16-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; FALLBACK16-NEXT:    movl %esi, {{[0-9]+}}(%esp)
+; FALLBACK16-NEXT:    movl %edi, {{[0-9]+}}(%esp)
+; FALLBACK16-NEXT:    movl %ebx, {{[0-9]+}}(%esp)
+; FALLBACK16-NEXT:    movl %ebp, {{[0-9]+}}(%esp)
+; FALLBACK16-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK16-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; FALLBACK16-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK16-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; FALLBACK16-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK16-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; FALLBACK16-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK16-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; FALLBACK16-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
+; FALLBACK16-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK16-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; FALLBACK16-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK16-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; FALLBACK16-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK16-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; FALLBACK16-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK16-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; FALLBACK16-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK16-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; FALLBACK16-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK16-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; FALLBACK16-NEXT:    movl %eax, %esi
+; FALLBACK16-NEXT:    andl $60, %esi
+; FALLBACK16-NEXT:    movl 68(%esp,%esi), %edx
+; FALLBACK16-NEXT:    shll $3, %eax
+; FALLBACK16-NEXT:    andl $24, %eax
+; FALLBACK16-NEXT:    movl %edx, %edi
+; FALLBACK16-NEXT:    movl %eax, %ecx
+; FALLBACK16-NEXT:    shrl %cl, %edi
+; FALLBACK16-NEXT:    movl 72(%esp,%esi), %ecx
+; FALLBACK16-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK16-NEXT:    leal (%ecx,%ecx), %ebx
+; FALLBACK16-NEXT:    movb %al, %ch
+; FALLBACK16-NEXT:    notb %ch
+; FALLBACK16-NEXT:    movb %ch, %cl
+; FALLBACK16-NEXT:    shll %cl, %ebx
+; FALLBACK16-NEXT:    orl %edi, %ebx
+; FALLBACK16-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK16-NEXT:    movl 64(%esp,%esi), %edi
+; FALLBACK16-NEXT:    movb %al, %cl
+; FALLBACK16-NEXT:    shrl %cl, %edi
+; FALLBACK16-NEXT:    addl %edx, %edx
+; FALLBACK16-NEXT:    movb %ch, %cl
+; FALLBACK16-NEXT:    shll %cl, %edx
+; FALLBACK16-NEXT:    orl %edi, %edx
+; FALLBACK16-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK16-NEXT:    movl 76(%esp,%esi), %edx
+; FALLBACK16-NEXT:    movl %edx, %ebp
+; FALLBACK16-NEXT:    movb %al, %cl
+; FALLBACK16-NEXT:    shrl %cl, %ebp
+; FALLBACK16-NEXT:    movl 80(%esp,%esi), %edi
+; FALLBACK16-NEXT:    leal (%edi,%edi), %ebx
+; FALLBACK16-NEXT:    movb %ch, %cl
+; FALLBACK16-NEXT:    shll %cl, %ebx
+; FALLBACK16-NEXT:    orl %ebp, %ebx
+; FALLBACK16-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK16-NEXT:    movb %al, %cl
+; FALLBACK16-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; FALLBACK16-NEXT:    shrl %cl, %ebx
+; FALLBACK16-NEXT:    addl %edx, %edx
+; FALLBACK16-NEXT:    movb %ch, %cl
+; FALLBACK16-NEXT:    shll %cl, %edx
+; FALLBACK16-NEXT:    orl %ebx, %edx
+; FALLBACK16-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK16-NEXT:    movl 84(%esp,%esi), %ebx
+; FALLBACK16-NEXT:    movl %ebx, %ebp
+; FALLBACK16-NEXT:    movl %eax, %edx
+; FALLBACK16-NEXT:    movb %dl, %cl
+; FALLBACK16-NEXT:    shrl %cl, %ebp
+; FALLBACK16-NEXT:    movl 88(%esp,%esi), %eax
+; FALLBACK16-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK16-NEXT:    addl %eax, %eax
+; FALLBACK16-NEXT:    movb %ch, %cl
+; FALLBACK16-NEXT:    shll %cl, %eax
+; FALLBACK16-NEXT:    orl %ebp, %eax
+; FALLBACK16-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK16-NEXT:    movb %dl, %cl
+; FALLBACK16-NEXT:    shrl %cl, %edi
+; FALLBACK16-NEXT:    addl %ebx, %ebx
+; FALLBACK16-NEXT:    movb %ch, %cl
+; FALLBACK16-NEXT:    movb %ch, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill
+; FALLBACK16-NEXT:    shll %cl, %ebx
+; FALLBACK16-NEXT:    orl %edi, %ebx
+; FALLBACK16-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK16-NEXT:    movl 92(%esp,%esi), %ebx
+; FALLBACK16-NEXT:    movl %ebx, %ebp
+; FALLBACK16-NEXT:    movb %dl, %cl
+; FALLBACK16-NEXT:    shrl %cl, %ebp
+; FALLBACK16-NEXT:    movl 96(%esp,%esi), %edi
+; FALLBACK16-NEXT:    leal (%edi,%edi), %eax
+; FALLBACK16-NEXT:    movb %ch, %cl
+; FALLBACK16-NEXT:    shll %cl, %eax
+; FALLBACK16-NEXT:    orl %ebp, %eax
+; FALLBACK16-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK16-NEXT:    movb %dl, %cl
+; FALLBACK16-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK16-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK16-NEXT:    shrl %cl, %eax
+; FALLBACK16-NEXT:    addl %ebx, %ebx
+; FALLBACK16-NEXT:    movb %ch, %cl
+; FALLBACK16-NEXT:    shll %cl, %ebx
+; FALLBACK16-NEXT:    orl %eax, %ebx
+; FALLBACK16-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK16-NEXT:    movl 100(%esp,%esi), %ebx
+; FALLBACK16-NEXT:    movl %ebx, %ebp
+; FALLBACK16-NEXT:    movb %dl, %cl
+; FALLBACK16-NEXT:    shrl %cl, %ebp
+; FALLBACK16-NEXT:    movl 104(%esp,%esi), %edx
+; FALLBACK16-NEXT:    leal (%edx,%edx), %eax
+; FALLBACK16-NEXT:    movb %ch, %cl
+; FALLBACK16-NEXT:    shll %cl, %eax
+; FALLBACK16-NEXT:    orl %ebp, %eax
+; FALLBACK16-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK16-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK16-NEXT:    movb %al, %cl
+; FALLBACK16-NEXT:    shrl %cl, %edi
+; FALLBACK16-NEXT:    addl %ebx, %ebx
+; FALLBACK16-NEXT:    movb %ch, %cl
+; FALLBACK16-NEXT:    shll %cl, %ebx
+; FALLBACK16-NEXT:    orl %edi, %ebx
+; FALLBACK16-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK16-NEXT:    movl 108(%esp,%esi), %edi
+; FALLBACK16-NEXT:    movl %edi, %ebp
+; FALLBACK16-NEXT:    movl %eax, %ecx
+; FALLBACK16-NEXT:    shrl %cl, %ebp
+; FALLBACK16-NEXT:    movl 112(%esp,%esi), %ecx
+; FALLBACK16-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK16-NEXT:    leal (%ecx,%ecx), %ebx
+; FALLBACK16-NEXT:    movb {{[-0-9]+}}(%e{{[sb]}}p), %ch # 1-byte Reload
+; FALLBACK16-NEXT:    movb %ch, %cl
+; FALLBACK16-NEXT:    shll %cl, %ebx
+; FALLBACK16-NEXT:    orl %ebp, %ebx
+; FALLBACK16-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK16-NEXT:    movb %al, %cl
+; FALLBACK16-NEXT:    shrl %cl, %edx
+; FALLBACK16-NEXT:    addl %edi, %edi
+; FALLBACK16-NEXT:    movb %ch, %cl
+; FALLBACK16-NEXT:    shll %cl, %edi
+; FALLBACK16-NEXT:    orl %edx, %edi
+; FALLBACK16-NEXT:    movl %esi, %edx
+; FALLBACK16-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK16-NEXT:    movl 116(%esp,%esi), %esi
+; FALLBACK16-NEXT:    movl %esi, %ebx
+; FALLBACK16-NEXT:    movb %al, %cl
+; FALLBACK16-NEXT:    shrl %cl, %ebx
+; FALLBACK16-NEXT:    movl 120(%esp,%edx), %eax
+; FALLBACK16-NEXT:    leal (%eax,%eax), %ebp
+; FALLBACK16-NEXT:    movb %ch, %cl
+; FALLBACK16-NEXT:    shll %cl, %ebp
+; FALLBACK16-NEXT:    orl %ebx, %ebp
+; FALLBACK16-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; FALLBACK16-NEXT:    movb %dl, %cl
+; FALLBACK16-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; FALLBACK16-NEXT:    shrl %cl, %ebx
+; FALLBACK16-NEXT:    addl %esi, %esi
+; FALLBACK16-NEXT:    movb %ch, %cl
+; FALLBACK16-NEXT:    shll %cl, %esi
+; FALLBACK16-NEXT:    orl %ebx, %esi
+; FALLBACK16-NEXT:    movb %dl, %cl
+; FALLBACK16-NEXT:    shrl %cl, %eax
+; FALLBACK16-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; FALLBACK16-NEXT:    movl 124(%esp,%edx), %ebx
+; FALLBACK16-NEXT:    leal (%ebx,%ebx), %edx
+; FALLBACK16-NEXT:    movb %ch, %cl
+; FALLBACK16-NEXT:    shll %cl, %edx
+; FALLBACK16-NEXT:    orl %eax, %edx
+; FALLBACK16-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK16-NEXT:    # kill: def $cl killed $cl killed $ecx
+; FALLBACK16-NEXT:    shrl %cl, %ebx
+; FALLBACK16-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; FALLBACK16-NEXT:    movl %ebx, 60(%eax)
+; FALLBACK16-NEXT:    movl %edx, 56(%eax)
+; FALLBACK16-NEXT:    movl %esi, 48(%eax)
+; FALLBACK16-NEXT:    movl %ebp, 52(%eax)
+; FALLBACK16-NEXT:    movl %edi, 40(%eax)
+; FALLBACK16-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK16-NEXT:    movl %ecx, 44(%eax)
+; FALLBACK16-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK16-NEXT:    movl %ecx, 32(%eax)
+; FALLBACK16-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK16-NEXT:    movl %ecx, 36(%eax)
+; FALLBACK16-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK16-NEXT:    movl %ecx, 24(%eax)
+; FALLBACK16-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK16-NEXT:    movl %ecx, 28(%eax)
+; FALLBACK16-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK16-NEXT:    movl %ecx, 16(%eax)
+; FALLBACK16-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK16-NEXT:    movl %ecx, 20(%eax)
+; FALLBACK16-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK16-NEXT:    movl %ecx, 8(%eax)
+; FALLBACK16-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK16-NEXT:    movl %ecx, 12(%eax)
+; FALLBACK16-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK16-NEXT:    movl %ecx, (%eax)
+; FALLBACK16-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK16-NEXT:    movl %ecx, 4(%eax)
+; FALLBACK16-NEXT:    addl $204, %esp
+; FALLBACK16-NEXT:    popl %esi
+; FALLBACK16-NEXT:    popl %edi
+; FALLBACK16-NEXT:    popl %ebx
+; FALLBACK16-NEXT:    popl %ebp
+; FALLBACK16-NEXT:    retl
+;
+; FALLBACK17-LABEL: lshr_64bytes:
+; FALLBACK17:       # %bb.0:
+; FALLBACK17-NEXT:    pushl %ebp
+; FALLBACK17-NEXT:    pushl %ebx
+; FALLBACK17-NEXT:    pushl %edi
+; FALLBACK17-NEXT:    pushl %esi
+; FALLBACK17-NEXT:    subl $188, %esp
+; FALLBACK17-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; FALLBACK17-NEXT:    movl (%ecx), %eax
+; FALLBACK17-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK17-NEXT:    movl 4(%ecx), %eax
+; FALLBACK17-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK17-NEXT:    movl 8(%ecx), %eax
+; FALLBACK17-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK17-NEXT:    movl 12(%ecx), %eax
+; FALLBACK17-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK17-NEXT:    movl 16(%ecx), %eax
+; FALLBACK17-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK17-NEXT:    movl 20(%ecx), %eax
+; FALLBACK17-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK17-NEXT:    movl 24(%ecx), %eax
+; FALLBACK17-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK17-NEXT:    movl 28(%ecx), %eax
+; FALLBACK17-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK17-NEXT:    movl 32(%ecx), %eax
+; FALLBACK17-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK17-NEXT:    movl 36(%ecx), %eax
+; FALLBACK17-NEXT:    movl %eax, (%esp) # 4-byte Spill
+; FALLBACK17-NEXT:    movl 40(%ecx), %ebp
+; FALLBACK17-NEXT:    movl 44(%ecx), %ebx
+; FALLBACK17-NEXT:    movl 48(%ecx), %edi
+; FALLBACK17-NEXT:    movl 52(%ecx), %esi
+; FALLBACK17-NEXT:    movl 56(%ecx), %edx
+; FALLBACK17-NEXT:    movl 60(%ecx), %eax
+; FALLBACK17-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; FALLBACK17-NEXT:    movl (%ecx), %ecx
+; FALLBACK17-NEXT:    xorps %xmm0, %xmm0
+; FALLBACK17-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
+; FALLBACK17-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
+; FALLBACK17-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
+; FALLBACK17-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK17-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; FALLBACK17-NEXT:    movl %esi, {{[0-9]+}}(%esp)
+; FALLBACK17-NEXT:    movl %edi, {{[0-9]+}}(%esp)
+; FALLBACK17-NEXT:    movl %ebx, {{[0-9]+}}(%esp)
+; FALLBACK17-NEXT:    movl %ebp, {{[0-9]+}}(%esp)
+; FALLBACK17-NEXT:    movl (%esp), %eax # 4-byte Reload
+; FALLBACK17-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK17-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK17-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK17-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK17-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK17-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK17-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK17-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
+; FALLBACK17-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK17-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK17-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK17-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK17-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK17-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK17-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK17-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK17-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK17-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK17-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK17-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK17-NEXT:    movl %ecx, %ebp
+; FALLBACK17-NEXT:    andl $60, %ebp
+; FALLBACK17-NEXT:    movl 56(%esp,%ebp), %edx
+; FALLBACK17-NEXT:    movl 52(%esp,%ebp), %eax
+; FALLBACK17-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK17-NEXT:    shll $3, %ecx
+; FALLBACK17-NEXT:    andl $24, %ecx
+; FALLBACK17-NEXT:    shrdl %cl, %edx, %eax
+; FALLBACK17-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK17-NEXT:    movl 64(%esp,%ebp), %edi
+; FALLBACK17-NEXT:    movl 60(%esp,%ebp), %eax
+; FALLBACK17-NEXT:    movl %eax, %esi
+; FALLBACK17-NEXT:    shrdl %cl, %edi, %esi
+; FALLBACK17-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK17-NEXT:    shrdl %cl, %eax, %edx
+; FALLBACK17-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK17-NEXT:    movl 72(%esp,%ebp), %esi
+; FALLBACK17-NEXT:    movl 68(%esp,%ebp), %eax
+; FALLBACK17-NEXT:    movl %eax, %edx
+; FALLBACK17-NEXT:    shrdl %cl, %esi, %edx
+; FALLBACK17-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK17-NEXT:    shrdl %cl, %eax, %edi
+; FALLBACK17-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK17-NEXT:    movl 80(%esp,%ebp), %edi
+; FALLBACK17-NEXT:    movl 76(%esp,%ebp), %eax
+; FALLBACK17-NEXT:    movl %eax, %edx
+; FALLBACK17-NEXT:    shrdl %cl, %edi, %edx
+; FALLBACK17-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK17-NEXT:    shrdl %cl, %eax, %esi
+; FALLBACK17-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK17-NEXT:    movl 88(%esp,%ebp), %esi
+; FALLBACK17-NEXT:    movl 84(%esp,%ebp), %eax
+; FALLBACK17-NEXT:    movl %eax, %edx
+; FALLBACK17-NEXT:    shrdl %cl, %esi, %edx
+; FALLBACK17-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK17-NEXT:    movl %esi, %edx
+; FALLBACK17-NEXT:    shrdl %cl, %eax, %edi
+; FALLBACK17-NEXT:    movl %edi, (%esp) # 4-byte Spill
+; FALLBACK17-NEXT:    movl 96(%esp,%ebp), %esi
+; FALLBACK17-NEXT:    movl 92(%esp,%ebp), %eax
+; FALLBACK17-NEXT:    movl %eax, %edi
+; FALLBACK17-NEXT:    shrdl %cl, %esi, %edi
+; FALLBACK17-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK17-NEXT:    shrdl %cl, %eax, %edx
+; FALLBACK17-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK17-NEXT:    movl 104(%esp,%ebp), %edx
+; FALLBACK17-NEXT:    movl 100(%esp,%ebp), %eax
+; FALLBACK17-NEXT:    movl %eax, %edi
+; FALLBACK17-NEXT:    shrdl %cl, %edx, %edi
+; FALLBACK17-NEXT:    shrdl %cl, %eax, %esi
+; FALLBACK17-NEXT:    movl 48(%esp,%ebp), %ebx
+; FALLBACK17-NEXT:    movl 108(%esp,%ebp), %eax
+; FALLBACK17-NEXT:    shrdl %cl, %eax, %edx
+; FALLBACK17-NEXT:    movl {{[0-9]+}}(%esp), %ebp
+; FALLBACK17-NEXT:    movl %edx, 56(%ebp)
+; FALLBACK17-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; FALLBACK17-NEXT:    shrdl %cl, %edx, %ebx
+; FALLBACK17-NEXT:    # kill: def $cl killed $cl killed $ecx
+; FALLBACK17-NEXT:    shrl %cl, %eax
+; FALLBACK17-NEXT:    movl %eax, 60(%ebp)
+; FALLBACK17-NEXT:    movl %esi, 48(%ebp)
+; FALLBACK17-NEXT:    movl %edi, 52(%ebp)
+; FALLBACK17-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK17-NEXT:    movl %eax, 40(%ebp)
+; FALLBACK17-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK17-NEXT:    movl %eax, 44(%ebp)
+; FALLBACK17-NEXT:    movl (%esp), %eax # 4-byte Reload
+; FALLBACK17-NEXT:    movl %eax, 32(%ebp)
+; FALLBACK17-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK17-NEXT:    movl %eax, 36(%ebp)
+; FALLBACK17-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK17-NEXT:    movl %eax, 24(%ebp)
+; FALLBACK17-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK17-NEXT:    movl %eax, 28(%ebp)
+; FALLBACK17-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK17-NEXT:    movl %eax, 16(%ebp)
+; FALLBACK17-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK17-NEXT:    movl %eax, 20(%ebp)
+; FALLBACK17-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK17-NEXT:    movl %eax, 8(%ebp)
+; FALLBACK17-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK17-NEXT:    movl %eax, 12(%ebp)
+; FALLBACK17-NEXT:    movl %ebx, (%ebp)
+; FALLBACK17-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK17-NEXT:    movl %eax, 4(%ebp)
+; FALLBACK17-NEXT:    addl $188, %esp
+; FALLBACK17-NEXT:    popl %esi
+; FALLBACK17-NEXT:    popl %edi
+; FALLBACK17-NEXT:    popl %ebx
+; FALLBACK17-NEXT:    popl %ebp
+; FALLBACK17-NEXT:    retl
+;
+; FALLBACK18-LABEL: lshr_64bytes:
+; FALLBACK18:       # %bb.0:
+; FALLBACK18-NEXT:    pushl %ebp
+; FALLBACK18-NEXT:    pushl %ebx
+; FALLBACK18-NEXT:    pushl %edi
+; FALLBACK18-NEXT:    pushl %esi
+; FALLBACK18-NEXT:    subl $204, %esp
+; FALLBACK18-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; FALLBACK18-NEXT:    movl (%eax), %ecx
+; FALLBACK18-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK18-NEXT:    movl 4(%eax), %ecx
+; FALLBACK18-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK18-NEXT:    movl 8(%eax), %ecx
+; FALLBACK18-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK18-NEXT:    movl 12(%eax), %ecx
+; FALLBACK18-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK18-NEXT:    movl 16(%eax), %ecx
+; FALLBACK18-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK18-NEXT:    movl 20(%eax), %ecx
+; FALLBACK18-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK18-NEXT:    movl 24(%eax), %ecx
+; FALLBACK18-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK18-NEXT:    movl 28(%eax), %ecx
+; FALLBACK18-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK18-NEXT:    movl 32(%eax), %ecx
+; FALLBACK18-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK18-NEXT:    movl 36(%eax), %ecx
+; FALLBACK18-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK18-NEXT:    movl 40(%eax), %ebp
+; FALLBACK18-NEXT:    movl 44(%eax), %ebx
+; FALLBACK18-NEXT:    movl 48(%eax), %edi
+; FALLBACK18-NEXT:    movl 52(%eax), %esi
+; FALLBACK18-NEXT:    movl 56(%eax), %edx
+; FALLBACK18-NEXT:    movl 60(%eax), %ecx
+; FALLBACK18-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; FALLBACK18-NEXT:    movl (%eax), %eax
+; FALLBACK18-NEXT:    xorps %xmm0, %xmm0
+; FALLBACK18-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
+; FALLBACK18-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
+; FALLBACK18-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
+; FALLBACK18-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; FALLBACK18-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; FALLBACK18-NEXT:    movl %esi, {{[0-9]+}}(%esp)
+; FALLBACK18-NEXT:    movl %edi, {{[0-9]+}}(%esp)
+; FALLBACK18-NEXT:    movl %ebx, {{[0-9]+}}(%esp)
+; FALLBACK18-NEXT:    movl %ebp, {{[0-9]+}}(%esp)
+; FALLBACK18-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK18-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; FALLBACK18-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK18-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; FALLBACK18-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK18-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; FALLBACK18-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK18-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; FALLBACK18-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
+; FALLBACK18-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK18-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; FALLBACK18-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK18-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; FALLBACK18-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK18-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; FALLBACK18-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK18-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; FALLBACK18-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK18-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; FALLBACK18-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK18-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; FALLBACK18-NEXT:    movl %eax, %ecx
+; FALLBACK18-NEXT:    leal (,%eax,8), %edx
+; FALLBACK18-NEXT:    andl $24, %edx
+; FALLBACK18-NEXT:    andl $60, %ecx
+; FALLBACK18-NEXT:    movl 68(%esp,%ecx), %esi
+; FALLBACK18-NEXT:    movl 72(%esp,%ecx), %eax
+; FALLBACK18-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK18-NEXT:    shrxl %edx, %esi, %edi
+; FALLBACK18-NEXT:    movl %edx, %ebx
+; FALLBACK18-NEXT:    notb %bl
+; FALLBACK18-NEXT:    leal (%eax,%eax), %ebp
+; FALLBACK18-NEXT:    shlxl %ebx, %ebp, %eax
+; FALLBACK18-NEXT:    orl %edi, %eax
+; FALLBACK18-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK18-NEXT:    shrxl %edx, 64(%esp,%ecx), %edi
+; FALLBACK18-NEXT:    addl %esi, %esi
+; FALLBACK18-NEXT:    shlxl %ebx, %esi, %eax
+; FALLBACK18-NEXT:    orl %edi, %eax
+; FALLBACK18-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK18-NEXT:    movl 80(%esp,%ecx), %esi
+; FALLBACK18-NEXT:    leal (%esi,%esi), %edi
+; FALLBACK18-NEXT:    shlxl %ebx, %edi, %eax
+; FALLBACK18-NEXT:    movl 76(%esp,%ecx), %edi
+; FALLBACK18-NEXT:    shrxl %edx, %edi, %ebp
+; FALLBACK18-NEXT:    orl %ebp, %eax
+; FALLBACK18-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK18-NEXT:    shrxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; FALLBACK18-NEXT:    addl %edi, %edi
+; FALLBACK18-NEXT:    shlxl %ebx, %edi, %edi
+; FALLBACK18-NEXT:    orl %eax, %edi
+; FALLBACK18-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK18-NEXT:    movl 88(%esp,%ecx), %eax
+; FALLBACK18-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK18-NEXT:    leal (%eax,%eax), %edi
+; FALLBACK18-NEXT:    shlxl %ebx, %edi, %eax
+; FALLBACK18-NEXT:    movl 84(%esp,%ecx), %edi
+; FALLBACK18-NEXT:    shrxl %edx, %edi, %ebp
+; FALLBACK18-NEXT:    orl %ebp, %eax
+; FALLBACK18-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK18-NEXT:    shrxl %edx, %esi, %esi
+; FALLBACK18-NEXT:    addl %edi, %edi
+; FALLBACK18-NEXT:    shlxl %ebx, %edi, %eax
+; FALLBACK18-NEXT:    orl %esi, %eax
+; FALLBACK18-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK18-NEXT:    movl 96(%esp,%ecx), %esi
+; FALLBACK18-NEXT:    leal (%esi,%esi), %edi
+; FALLBACK18-NEXT:    shlxl %ebx, %edi, %eax
+; FALLBACK18-NEXT:    movl 92(%esp,%ecx), %edi
+; FALLBACK18-NEXT:    shrxl %edx, %edi, %ebp
+; FALLBACK18-NEXT:    orl %ebp, %eax
+; FALLBACK18-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK18-NEXT:    shrxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; FALLBACK18-NEXT:    addl %edi, %edi
+; FALLBACK18-NEXT:    shlxl %ebx, %edi, %edi
+; FALLBACK18-NEXT:    orl %eax, %edi
+; FALLBACK18-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK18-NEXT:    movl 104(%esp,%ecx), %eax
+; FALLBACK18-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK18-NEXT:    leal (%eax,%eax), %edi
+; FALLBACK18-NEXT:    shlxl %ebx, %edi, %eax
+; FALLBACK18-NEXT:    movl 100(%esp,%ecx), %edi
+; FALLBACK18-NEXT:    shrxl %edx, %edi, %ebp
+; FALLBACK18-NEXT:    orl %ebp, %eax
+; FALLBACK18-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK18-NEXT:    shrxl %edx, %esi, %esi
+; FALLBACK18-NEXT:    addl %edi, %edi
+; FALLBACK18-NEXT:    shlxl %ebx, %edi, %eax
+; FALLBACK18-NEXT:    orl %esi, %eax
+; FALLBACK18-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK18-NEXT:    movl 112(%esp,%ecx), %eax
+; FALLBACK18-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK18-NEXT:    leal (%eax,%eax), %esi
+; FALLBACK18-NEXT:    shlxl %ebx, %esi, %eax
+; FALLBACK18-NEXT:    movl 108(%esp,%ecx), %esi
+; FALLBACK18-NEXT:    movl %ecx, %edi
+; FALLBACK18-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK18-NEXT:    shrxl %edx, %esi, %ebp
+; FALLBACK18-NEXT:    orl %ebp, %eax
+; FALLBACK18-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK18-NEXT:    shrxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
+; FALLBACK18-NEXT:    addl %esi, %esi
+; FALLBACK18-NEXT:    shlxl %ebx, %esi, %esi
+; FALLBACK18-NEXT:    orl %ecx, %esi
+; FALLBACK18-NEXT:    movl 120(%esp,%edi), %ebp
+; FALLBACK18-NEXT:    leal (%ebp,%ebp), %ecx
+; FALLBACK18-NEXT:    shlxl %ebx, %ecx, %ecx
+; FALLBACK18-NEXT:    movl 116(%esp,%edi), %eax
+; FALLBACK18-NEXT:    shrxl %edx, %eax, %edi
+; FALLBACK18-NEXT:    orl %edi, %ecx
+; FALLBACK18-NEXT:    shrxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
+; FALLBACK18-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK18-NEXT:    addl %eax, %eax
+; FALLBACK18-NEXT:    shlxl %ebx, %eax, %edi
+; FALLBACK18-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
+; FALLBACK18-NEXT:    shrxl %edx, %ebp, %eax
+; FALLBACK18-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
+; FALLBACK18-NEXT:    movl 124(%esp,%ebp), %ebp
+; FALLBACK18-NEXT:    shrxl %edx, %ebp, %edx
+; FALLBACK18-NEXT:    addl %ebp, %ebp
+; FALLBACK18-NEXT:    shlxl %ebx, %ebp, %ebx
+; FALLBACK18-NEXT:    orl %eax, %ebx
+; FALLBACK18-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; FALLBACK18-NEXT:    movl %edx, 60(%eax)
+; FALLBACK18-NEXT:    movl %ebx, 56(%eax)
+; FALLBACK18-NEXT:    movl %edi, 48(%eax)
+; FALLBACK18-NEXT:    movl %ecx, 52(%eax)
+; FALLBACK18-NEXT:    movl %esi, 40(%eax)
+; FALLBACK18-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK18-NEXT:    movl %ecx, 44(%eax)
+; FALLBACK18-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK18-NEXT:    movl %ecx, 32(%eax)
+; FALLBACK18-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK18-NEXT:    movl %ecx, 36(%eax)
+; FALLBACK18-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK18-NEXT:    movl %ecx, 24(%eax)
+; FALLBACK18-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK18-NEXT:    movl %ecx, 28(%eax)
+; FALLBACK18-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK18-NEXT:    movl %ecx, 16(%eax)
+; FALLBACK18-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK18-NEXT:    movl %ecx, 20(%eax)
+; FALLBACK18-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK18-NEXT:    movl %ecx, 8(%eax)
+; FALLBACK18-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK18-NEXT:    movl %ecx, 12(%eax)
+; FALLBACK18-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK18-NEXT:    movl %ecx, (%eax)
+; FALLBACK18-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK18-NEXT:    movl %ecx, 4(%eax)
+; FALLBACK18-NEXT:    addl $204, %esp
+; FALLBACK18-NEXT:    popl %esi
+; FALLBACK18-NEXT:    popl %edi
+; FALLBACK18-NEXT:    popl %ebx
+; FALLBACK18-NEXT:    popl %ebp
+; FALLBACK18-NEXT:    retl
+;
+; FALLBACK19-LABEL: lshr_64bytes:
+; FALLBACK19:       # %bb.0:
+; FALLBACK19-NEXT:    pushl %ebp
+; FALLBACK19-NEXT:    pushl %ebx
+; FALLBACK19-NEXT:    pushl %edi
+; FALLBACK19-NEXT:    pushl %esi
+; FALLBACK19-NEXT:    subl $188, %esp
+; FALLBACK19-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; FALLBACK19-NEXT:    movl (%ecx), %eax
+; FALLBACK19-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK19-NEXT:    movl 4(%ecx), %eax
+; FALLBACK19-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK19-NEXT:    movl 8(%ecx), %eax
+; FALLBACK19-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK19-NEXT:    movl 12(%ecx), %eax
+; FALLBACK19-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK19-NEXT:    movl 16(%ecx), %eax
+; FALLBACK19-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK19-NEXT:    movl 20(%ecx), %eax
+; FALLBACK19-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK19-NEXT:    movl 24(%ecx), %eax
+; FALLBACK19-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK19-NEXT:    movl 28(%ecx), %eax
+; FALLBACK19-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK19-NEXT:    movl 32(%ecx), %eax
+; FALLBACK19-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK19-NEXT:    movl 36(%ecx), %eax
+; FALLBACK19-NEXT:    movl %eax, (%esp) # 4-byte Spill
+; FALLBACK19-NEXT:    movl 40(%ecx), %ebp
+; FALLBACK19-NEXT:    movl 44(%ecx), %ebx
+; FALLBACK19-NEXT:    movl 48(%ecx), %edi
+; FALLBACK19-NEXT:    movl 52(%ecx), %esi
+; FALLBACK19-NEXT:    movl 56(%ecx), %edx
+; FALLBACK19-NEXT:    movl 60(%ecx), %eax
+; FALLBACK19-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; FALLBACK19-NEXT:    movl (%ecx), %ecx
+; FALLBACK19-NEXT:    xorps %xmm0, %xmm0
+; FALLBACK19-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
+; FALLBACK19-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
+; FALLBACK19-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
+; FALLBACK19-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK19-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; FALLBACK19-NEXT:    movl %esi, {{[0-9]+}}(%esp)
+; FALLBACK19-NEXT:    movl %edi, {{[0-9]+}}(%esp)
+; FALLBACK19-NEXT:    movl %ebx, {{[0-9]+}}(%esp)
+; FALLBACK19-NEXT:    movl %ebp, {{[0-9]+}}(%esp)
+; FALLBACK19-NEXT:    movl (%esp), %eax # 4-byte Reload
+; FALLBACK19-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK19-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK19-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK19-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK19-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK19-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK19-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK19-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
+; FALLBACK19-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK19-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK19-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK19-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK19-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK19-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK19-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK19-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK19-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK19-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK19-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK19-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK19-NEXT:    movl %ecx, %ebp
+; FALLBACK19-NEXT:    andl $60, %ebp
+; FALLBACK19-NEXT:    movl 56(%esp,%ebp), %edx
+; FALLBACK19-NEXT:    movl 52(%esp,%ebp), %eax
+; FALLBACK19-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK19-NEXT:    shll $3, %ecx
+; FALLBACK19-NEXT:    andl $24, %ecx
+; FALLBACK19-NEXT:    shrdl %cl, %edx, %eax
+; FALLBACK19-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK19-NEXT:    movl 64(%esp,%ebp), %edi
+; FALLBACK19-NEXT:    movl 60(%esp,%ebp), %eax
+; FALLBACK19-NEXT:    movl %eax, %esi
+; FALLBACK19-NEXT:    shrdl %cl, %edi, %esi
+; FALLBACK19-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK19-NEXT:    shrdl %cl, %eax, %edx
+; FALLBACK19-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK19-NEXT:    movl 72(%esp,%ebp), %esi
+; FALLBACK19-NEXT:    movl 68(%esp,%ebp), %eax
+; FALLBACK19-NEXT:    movl %eax, %edx
+; FALLBACK19-NEXT:    shrdl %cl, %esi, %edx
+; FALLBACK19-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK19-NEXT:    shrdl %cl, %eax, %edi
+; FALLBACK19-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK19-NEXT:    movl 80(%esp,%ebp), %edi
+; FALLBACK19-NEXT:    movl 76(%esp,%ebp), %eax
+; FALLBACK19-NEXT:    movl %eax, %edx
+; FALLBACK19-NEXT:    shrdl %cl, %edi, %edx
+; FALLBACK19-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK19-NEXT:    shrdl %cl, %eax, %esi
+; FALLBACK19-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK19-NEXT:    movl 88(%esp,%ebp), %ebx
+; FALLBACK19-NEXT:    movl 84(%esp,%ebp), %eax
+; FALLBACK19-NEXT:    movl %eax, %edx
+; FALLBACK19-NEXT:    shrdl %cl, %ebx, %edx
+; FALLBACK19-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK19-NEXT:    shrdl %cl, %eax, %edi
+; FALLBACK19-NEXT:    movl %edi, (%esp) # 4-byte Spill
+; FALLBACK19-NEXT:    movl 96(%esp,%ebp), %esi
+; FALLBACK19-NEXT:    movl 92(%esp,%ebp), %eax
+; FALLBACK19-NEXT:    movl %eax, %edx
+; FALLBACK19-NEXT:    shrdl %cl, %esi, %edx
+; FALLBACK19-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK19-NEXT:    shrdl %cl, %eax, %ebx
+; FALLBACK19-NEXT:    movl 104(%esp,%ebp), %eax
+; FALLBACK19-NEXT:    movl 100(%esp,%ebp), %edi
+; FALLBACK19-NEXT:    movl %edi, %edx
+; FALLBACK19-NEXT:    shrdl %cl, %eax, %edx
+; FALLBACK19-NEXT:    shrdl %cl, %edi, %esi
+; FALLBACK19-NEXT:    movl 48(%esp,%ebp), %edi
+; FALLBACK19-NEXT:    movl 108(%esp,%ebp), %ebp
+; FALLBACK19-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK19-NEXT:    shrdl %cl, %ebp, %eax
+; FALLBACK19-NEXT:    movl {{[0-9]+}}(%esp), %ebp
+; FALLBACK19-NEXT:    movl %eax, 56(%ebp)
+; FALLBACK19-NEXT:    movl %esi, 48(%ebp)
+; FALLBACK19-NEXT:    movl %edx, 52(%ebp)
+; FALLBACK19-NEXT:    movl %ebx, 40(%ebp)
+; FALLBACK19-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK19-NEXT:    movl %eax, 44(%ebp)
+; FALLBACK19-NEXT:    movl (%esp), %eax # 4-byte Reload
+; FALLBACK19-NEXT:    movl %eax, 32(%ebp)
+; FALLBACK19-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK19-NEXT:    movl %eax, 36(%ebp)
+; FALLBACK19-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK19-NEXT:    movl %eax, 24(%ebp)
+; FALLBACK19-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK19-NEXT:    movl %eax, 28(%ebp)
+; FALLBACK19-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK19-NEXT:    movl %eax, 16(%ebp)
+; FALLBACK19-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK19-NEXT:    movl %eax, 20(%ebp)
+; FALLBACK19-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK19-NEXT:    movl %eax, 8(%ebp)
+; FALLBACK19-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK19-NEXT:    movl %eax, 12(%ebp)
+; FALLBACK19-NEXT:    shrxl %ecx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; FALLBACK19-NEXT:    # kill: def $cl killed $cl killed $ecx
+; FALLBACK19-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; FALLBACK19-NEXT:    shrdl %cl, %edx, %edi
+; FALLBACK19-NEXT:    movl %edi, (%ebp)
+; FALLBACK19-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK19-NEXT:    movl %ecx, 4(%ebp)
+; FALLBACK19-NEXT:    movl %eax, 60(%ebp)
+; FALLBACK19-NEXT:    addl $188, %esp
+; FALLBACK19-NEXT:    popl %esi
+; FALLBACK19-NEXT:    popl %edi
+; FALLBACK19-NEXT:    popl %ebx
+; FALLBACK19-NEXT:    popl %ebp
+; FALLBACK19-NEXT:    retl
+;
+; FALLBACK20-LABEL: lshr_64bytes:
+; FALLBACK20:       # %bb.0:
+; FALLBACK20-NEXT:    pushl %ebp
+; FALLBACK20-NEXT:    pushl %ebx
+; FALLBACK20-NEXT:    pushl %edi
+; FALLBACK20-NEXT:    pushl %esi
+; FALLBACK20-NEXT:    subl $204, %esp
+; FALLBACK20-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; FALLBACK20-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; FALLBACK20-NEXT:    movups (%ecx), %xmm0
+; FALLBACK20-NEXT:    movups 16(%ecx), %xmm1
+; FALLBACK20-NEXT:    movups 32(%ecx), %xmm2
+; FALLBACK20-NEXT:    movups 48(%ecx), %xmm3
+; FALLBACK20-NEXT:    movl (%eax), %eax
+; FALLBACK20-NEXT:    xorps %xmm4, %xmm4
+; FALLBACK20-NEXT:    movaps %xmm4, {{[0-9]+}}(%esp)
+; FALLBACK20-NEXT:    movaps %xmm4, {{[0-9]+}}(%esp)
+; FALLBACK20-NEXT:    movaps %xmm4, {{[0-9]+}}(%esp)
+; FALLBACK20-NEXT:    movaps %xmm4, {{[0-9]+}}(%esp)
+; FALLBACK20-NEXT:    movaps %xmm3, {{[0-9]+}}(%esp)
+; FALLBACK20-NEXT:    movaps %xmm2, {{[0-9]+}}(%esp)
+; FALLBACK20-NEXT:    movaps %xmm1, {{[0-9]+}}(%esp)
+; FALLBACK20-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
+; FALLBACK20-NEXT:    movl %eax, %esi
+; FALLBACK20-NEXT:    andl $60, %esi
+; FALLBACK20-NEXT:    movl 68(%esp,%esi), %edx
+; FALLBACK20-NEXT:    shll $3, %eax
+; FALLBACK20-NEXT:    andl $24, %eax
+; FALLBACK20-NEXT:    movl %edx, %edi
+; FALLBACK20-NEXT:    movl %eax, %ecx
+; FALLBACK20-NEXT:    shrl %cl, %edi
+; FALLBACK20-NEXT:    movl 72(%esp,%esi), %ecx
+; FALLBACK20-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK20-NEXT:    leal (%ecx,%ecx), %ebx
+; FALLBACK20-NEXT:    movb %al, %ch
+; FALLBACK20-NEXT:    notb %ch
+; FALLBACK20-NEXT:    movb %ch, %cl
+; FALLBACK20-NEXT:    shll %cl, %ebx
+; FALLBACK20-NEXT:    orl %edi, %ebx
+; FALLBACK20-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK20-NEXT:    movl 64(%esp,%esi), %edi
+; FALLBACK20-NEXT:    movb %al, %cl
+; FALLBACK20-NEXT:    shrl %cl, %edi
+; FALLBACK20-NEXT:    addl %edx, %edx
+; FALLBACK20-NEXT:    movb %ch, %cl
+; FALLBACK20-NEXT:    shll %cl, %edx
+; FALLBACK20-NEXT:    orl %edi, %edx
+; FALLBACK20-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK20-NEXT:    movl 76(%esp,%esi), %edx
+; FALLBACK20-NEXT:    movl %edx, %ebp
+; FALLBACK20-NEXT:    movb %al, %cl
+; FALLBACK20-NEXT:    shrl %cl, %ebp
+; FALLBACK20-NEXT:    movl 80(%esp,%esi), %edi
+; FALLBACK20-NEXT:    leal (%edi,%edi), %ebx
+; FALLBACK20-NEXT:    movb %ch, %cl
+; FALLBACK20-NEXT:    shll %cl, %ebx
+; FALLBACK20-NEXT:    orl %ebp, %ebx
+; FALLBACK20-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK20-NEXT:    movb %al, %cl
+; FALLBACK20-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; FALLBACK20-NEXT:    shrl %cl, %ebx
+; FALLBACK20-NEXT:    addl %edx, %edx
+; FALLBACK20-NEXT:    movb %ch, %cl
+; FALLBACK20-NEXT:    shll %cl, %edx
+; FALLBACK20-NEXT:    orl %ebx, %edx
+; FALLBACK20-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK20-NEXT:    movl 84(%esp,%esi), %ebx
+; FALLBACK20-NEXT:    movl %ebx, %ebp
+; FALLBACK20-NEXT:    movl %eax, %edx
+; FALLBACK20-NEXT:    movb %dl, %cl
+; FALLBACK20-NEXT:    shrl %cl, %ebp
+; FALLBACK20-NEXT:    movl 88(%esp,%esi), %eax
+; FALLBACK20-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK20-NEXT:    addl %eax, %eax
+; FALLBACK20-NEXT:    movb %ch, %cl
+; FALLBACK20-NEXT:    shll %cl, %eax
+; FALLBACK20-NEXT:    orl %ebp, %eax
+; FALLBACK20-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK20-NEXT:    movb %dl, %cl
+; FALLBACK20-NEXT:    shrl %cl, %edi
+; FALLBACK20-NEXT:    addl %ebx, %ebx
+; FALLBACK20-NEXT:    movb %ch, %cl
+; FALLBACK20-NEXT:    movb %ch, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill
+; FALLBACK20-NEXT:    shll %cl, %ebx
+; FALLBACK20-NEXT:    orl %edi, %ebx
+; FALLBACK20-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK20-NEXT:    movl 92(%esp,%esi), %ebx
+; FALLBACK20-NEXT:    movl %ebx, %ebp
+; FALLBACK20-NEXT:    movb %dl, %cl
+; FALLBACK20-NEXT:    shrl %cl, %ebp
+; FALLBACK20-NEXT:    movl 96(%esp,%esi), %edi
+; FALLBACK20-NEXT:    leal (%edi,%edi), %eax
+; FALLBACK20-NEXT:    movb %ch, %cl
+; FALLBACK20-NEXT:    shll %cl, %eax
+; FALLBACK20-NEXT:    orl %ebp, %eax
+; FALLBACK20-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK20-NEXT:    movb %dl, %cl
+; FALLBACK20-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK20-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK20-NEXT:    shrl %cl, %eax
+; FALLBACK20-NEXT:    addl %ebx, %ebx
+; FALLBACK20-NEXT:    movb %ch, %cl
+; FALLBACK20-NEXT:    shll %cl, %ebx
+; FALLBACK20-NEXT:    orl %eax, %ebx
+; FALLBACK20-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK20-NEXT:    movl 100(%esp,%esi), %ebx
+; FALLBACK20-NEXT:    movl %ebx, %ebp
+; FALLBACK20-NEXT:    movb %dl, %cl
+; FALLBACK20-NEXT:    shrl %cl, %ebp
+; FALLBACK20-NEXT:    movl 104(%esp,%esi), %edx
+; FALLBACK20-NEXT:    leal (%edx,%edx), %eax
+; FALLBACK20-NEXT:    movb %ch, %cl
+; FALLBACK20-NEXT:    shll %cl, %eax
+; FALLBACK20-NEXT:    orl %ebp, %eax
+; FALLBACK20-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK20-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK20-NEXT:    movb %al, %cl
+; FALLBACK20-NEXT:    shrl %cl, %edi
+; FALLBACK20-NEXT:    addl %ebx, %ebx
+; FALLBACK20-NEXT:    movb %ch, %cl
+; FALLBACK20-NEXT:    shll %cl, %ebx
+; FALLBACK20-NEXT:    orl %edi, %ebx
+; FALLBACK20-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK20-NEXT:    movl 108(%esp,%esi), %edi
+; FALLBACK20-NEXT:    movl %edi, %ebp
+; FALLBACK20-NEXT:    movl %eax, %ecx
+; FALLBACK20-NEXT:    shrl %cl, %ebp
+; FALLBACK20-NEXT:    movl 112(%esp,%esi), %ecx
+; FALLBACK20-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK20-NEXT:    leal (%ecx,%ecx), %ebx
+; FALLBACK20-NEXT:    movb {{[-0-9]+}}(%e{{[sb]}}p), %ch # 1-byte Reload
+; FALLBACK20-NEXT:    movb %ch, %cl
+; FALLBACK20-NEXT:    shll %cl, %ebx
+; FALLBACK20-NEXT:    orl %ebp, %ebx
+; FALLBACK20-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK20-NEXT:    movb %al, %cl
+; FALLBACK20-NEXT:    shrl %cl, %edx
+; FALLBACK20-NEXT:    addl %edi, %edi
+; FALLBACK20-NEXT:    movb %ch, %cl
+; FALLBACK20-NEXT:    shll %cl, %edi
+; FALLBACK20-NEXT:    orl %edx, %edi
+; FALLBACK20-NEXT:    movl %esi, %edx
+; FALLBACK20-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK20-NEXT:    movl 116(%esp,%esi), %esi
+; FALLBACK20-NEXT:    movl %esi, %ebx
+; FALLBACK20-NEXT:    movb %al, %cl
+; FALLBACK20-NEXT:    shrl %cl, %ebx
+; FALLBACK20-NEXT:    movl 120(%esp,%edx), %eax
+; FALLBACK20-NEXT:    leal (%eax,%eax), %ebp
+; FALLBACK20-NEXT:    movb %ch, %cl
+; FALLBACK20-NEXT:    shll %cl, %ebp
+; FALLBACK20-NEXT:    orl %ebx, %ebp
+; FALLBACK20-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; FALLBACK20-NEXT:    movb %dl, %cl
+; FALLBACK20-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; FALLBACK20-NEXT:    shrl %cl, %ebx
+; FALLBACK20-NEXT:    addl %esi, %esi
+; FALLBACK20-NEXT:    movb %ch, %cl
+; FALLBACK20-NEXT:    shll %cl, %esi
+; FALLBACK20-NEXT:    orl %ebx, %esi
+; FALLBACK20-NEXT:    movb %dl, %cl
+; FALLBACK20-NEXT:    shrl %cl, %eax
+; FALLBACK20-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; FALLBACK20-NEXT:    movl 124(%esp,%edx), %ebx
+; FALLBACK20-NEXT:    leal (%ebx,%ebx), %edx
+; FALLBACK20-NEXT:    movb %ch, %cl
+; FALLBACK20-NEXT:    shll %cl, %edx
+; FALLBACK20-NEXT:    orl %eax, %edx
+; FALLBACK20-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK20-NEXT:    # kill: def $cl killed $cl killed $ecx
+; FALLBACK20-NEXT:    shrl %cl, %ebx
+; FALLBACK20-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; FALLBACK20-NEXT:    movl %ebx, 60(%eax)
+; FALLBACK20-NEXT:    movl %edx, 56(%eax)
+; FALLBACK20-NEXT:    movl %esi, 48(%eax)
+; FALLBACK20-NEXT:    movl %ebp, 52(%eax)
+; FALLBACK20-NEXT:    movl %edi, 40(%eax)
+; FALLBACK20-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK20-NEXT:    movl %ecx, 44(%eax)
+; FALLBACK20-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK20-NEXT:    movl %ecx, 32(%eax)
+; FALLBACK20-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK20-NEXT:    movl %ecx, 36(%eax)
+; FALLBACK20-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK20-NEXT:    movl %ecx, 24(%eax)
+; FALLBACK20-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK20-NEXT:    movl %ecx, 28(%eax)
+; FALLBACK20-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK20-NEXT:    movl %ecx, 16(%eax)
+; FALLBACK20-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK20-NEXT:    movl %ecx, 20(%eax)
+; FALLBACK20-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK20-NEXT:    movl %ecx, 8(%eax)
+; FALLBACK20-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK20-NEXT:    movl %ecx, 12(%eax)
+; FALLBACK20-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK20-NEXT:    movl %ecx, (%eax)
+; FALLBACK20-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK20-NEXT:    movl %ecx, 4(%eax)
+; FALLBACK20-NEXT:    addl $204, %esp
+; FALLBACK20-NEXT:    popl %esi
+; FALLBACK20-NEXT:    popl %edi
+; FALLBACK20-NEXT:    popl %ebx
+; FALLBACK20-NEXT:    popl %ebp
+; FALLBACK20-NEXT:    retl
+;
+; FALLBACK21-LABEL: lshr_64bytes:
+; FALLBACK21:       # %bb.0:
+; FALLBACK21-NEXT:    pushl %ebp
+; FALLBACK21-NEXT:    pushl %ebx
+; FALLBACK21-NEXT:    pushl %edi
+; FALLBACK21-NEXT:    pushl %esi
+; FALLBACK21-NEXT:    subl $188, %esp
+; FALLBACK21-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; FALLBACK21-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; FALLBACK21-NEXT:    movups (%ecx), %xmm0
+; FALLBACK21-NEXT:    movups 16(%ecx), %xmm1
+; FALLBACK21-NEXT:    movups 32(%ecx), %xmm2
+; FALLBACK21-NEXT:    movups 48(%ecx), %xmm3
+; FALLBACK21-NEXT:    movl (%eax), %ecx
+; FALLBACK21-NEXT:    xorps %xmm4, %xmm4
+; FALLBACK21-NEXT:    movaps %xmm4, {{[0-9]+}}(%esp)
+; FALLBACK21-NEXT:    movaps %xmm4, {{[0-9]+}}(%esp)
+; FALLBACK21-NEXT:    movaps %xmm4, {{[0-9]+}}(%esp)
+; FALLBACK21-NEXT:    movaps %xmm4, {{[0-9]+}}(%esp)
+; FALLBACK21-NEXT:    movaps %xmm3, {{[0-9]+}}(%esp)
+; FALLBACK21-NEXT:    movaps %xmm2, {{[0-9]+}}(%esp)
+; FALLBACK21-NEXT:    movaps %xmm1, {{[0-9]+}}(%esp)
+; FALLBACK21-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
+; FALLBACK21-NEXT:    movl %ecx, %ebp
+; FALLBACK21-NEXT:    andl $60, %ebp
+; FALLBACK21-NEXT:    movl 56(%esp,%ebp), %edx
+; FALLBACK21-NEXT:    movl 52(%esp,%ebp), %eax
+; FALLBACK21-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK21-NEXT:    shll $3, %ecx
+; FALLBACK21-NEXT:    andl $24, %ecx
+; FALLBACK21-NEXT:    shrdl %cl, %edx, %eax
+; FALLBACK21-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK21-NEXT:    movl 64(%esp,%ebp), %edi
+; FALLBACK21-NEXT:    movl 60(%esp,%ebp), %eax
+; FALLBACK21-NEXT:    movl %eax, %esi
+; FALLBACK21-NEXT:    shrdl %cl, %edi, %esi
+; FALLBACK21-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK21-NEXT:    shrdl %cl, %eax, %edx
+; FALLBACK21-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK21-NEXT:    movl 72(%esp,%ebp), %esi
+; FALLBACK21-NEXT:    movl 68(%esp,%ebp), %eax
+; FALLBACK21-NEXT:    movl %eax, %edx
+; FALLBACK21-NEXT:    shrdl %cl, %esi, %edx
+; FALLBACK21-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK21-NEXT:    shrdl %cl, %eax, %edi
+; FALLBACK21-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK21-NEXT:    movl 80(%esp,%ebp), %edi
+; FALLBACK21-NEXT:    movl 76(%esp,%ebp), %eax
+; FALLBACK21-NEXT:    movl %eax, %edx
+; FALLBACK21-NEXT:    shrdl %cl, %edi, %edx
+; FALLBACK21-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK21-NEXT:    shrdl %cl, %eax, %esi
+; FALLBACK21-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK21-NEXT:    movl 88(%esp,%ebp), %esi
+; FALLBACK21-NEXT:    movl 84(%esp,%ebp), %eax
+; FALLBACK21-NEXT:    movl %eax, %edx
+; FALLBACK21-NEXT:    shrdl %cl, %esi, %edx
+; FALLBACK21-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK21-NEXT:    movl %esi, %edx
+; FALLBACK21-NEXT:    shrdl %cl, %eax, %edi
+; FALLBACK21-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK21-NEXT:    movl 96(%esp,%ebp), %esi
+; FALLBACK21-NEXT:    movl 92(%esp,%ebp), %eax
+; FALLBACK21-NEXT:    movl %eax, %edi
+; FALLBACK21-NEXT:    shrdl %cl, %esi, %edi
+; FALLBACK21-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK21-NEXT:    shrdl %cl, %eax, %edx
+; FALLBACK21-NEXT:    movl %edx, (%esp) # 4-byte Spill
+; FALLBACK21-NEXT:    movl 104(%esp,%ebp), %edx
+; FALLBACK21-NEXT:    movl 100(%esp,%ebp), %eax
+; FALLBACK21-NEXT:    movl %eax, %edi
+; FALLBACK21-NEXT:    shrdl %cl, %edx, %edi
+; FALLBACK21-NEXT:    shrdl %cl, %eax, %esi
+; FALLBACK21-NEXT:    movl 48(%esp,%ebp), %ebx
+; FALLBACK21-NEXT:    movl 108(%esp,%ebp), %eax
+; FALLBACK21-NEXT:    shrdl %cl, %eax, %edx
+; FALLBACK21-NEXT:    movl {{[0-9]+}}(%esp), %ebp
+; FALLBACK21-NEXT:    movl %edx, 56(%ebp)
+; FALLBACK21-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; FALLBACK21-NEXT:    shrdl %cl, %edx, %ebx
+; FALLBACK21-NEXT:    # kill: def $cl killed $cl killed $ecx
+; FALLBACK21-NEXT:    shrl %cl, %eax
+; FALLBACK21-NEXT:    movl %eax, 60(%ebp)
+; FALLBACK21-NEXT:    movl %esi, 48(%ebp)
+; FALLBACK21-NEXT:    movl %edi, 52(%ebp)
+; FALLBACK21-NEXT:    movl (%esp), %eax # 4-byte Reload
+; FALLBACK21-NEXT:    movl %eax, 40(%ebp)
+; FALLBACK21-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK21-NEXT:    movl %eax, 44(%ebp)
+; FALLBACK21-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK21-NEXT:    movl %eax, 32(%ebp)
+; FALLBACK21-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK21-NEXT:    movl %eax, 36(%ebp)
+; FALLBACK21-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK21-NEXT:    movl %eax, 24(%ebp)
+; FALLBACK21-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK21-NEXT:    movl %eax, 28(%ebp)
+; FALLBACK21-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK21-NEXT:    movl %eax, 16(%ebp)
+; FALLBACK21-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK21-NEXT:    movl %eax, 20(%ebp)
+; FALLBACK21-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK21-NEXT:    movl %eax, 8(%ebp)
+; FALLBACK21-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK21-NEXT:    movl %eax, 12(%ebp)
+; FALLBACK21-NEXT:    movl %ebx, (%ebp)
+; FALLBACK21-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK21-NEXT:    movl %eax, 4(%ebp)
+; FALLBACK21-NEXT:    addl $188, %esp
+; FALLBACK21-NEXT:    popl %esi
+; FALLBACK21-NEXT:    popl %edi
+; FALLBACK21-NEXT:    popl %ebx
+; FALLBACK21-NEXT:    popl %ebp
+; FALLBACK21-NEXT:    retl
+;
+; FALLBACK22-LABEL: lshr_64bytes:
+; FALLBACK22:       # %bb.0:
+; FALLBACK22-NEXT:    pushl %ebp
+; FALLBACK22-NEXT:    pushl %ebx
+; FALLBACK22-NEXT:    pushl %edi
+; FALLBACK22-NEXT:    pushl %esi
+; FALLBACK22-NEXT:    subl $204, %esp
+; FALLBACK22-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; FALLBACK22-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; FALLBACK22-NEXT:    movups (%ecx), %xmm0
+; FALLBACK22-NEXT:    movups 16(%ecx), %xmm1
+; FALLBACK22-NEXT:    movups 32(%ecx), %xmm2
+; FALLBACK22-NEXT:    movups 48(%ecx), %xmm3
+; FALLBACK22-NEXT:    movl (%eax), %ecx
+; FALLBACK22-NEXT:    xorps %xmm4, %xmm4
+; FALLBACK22-NEXT:    movaps %xmm4, {{[0-9]+}}(%esp)
+; FALLBACK22-NEXT:    movaps %xmm4, {{[0-9]+}}(%esp)
+; FALLBACK22-NEXT:    movaps %xmm4, {{[0-9]+}}(%esp)
+; FALLBACK22-NEXT:    movaps %xmm4, {{[0-9]+}}(%esp)
+; FALLBACK22-NEXT:    movaps %xmm3, {{[0-9]+}}(%esp)
+; FALLBACK22-NEXT:    movaps %xmm2, {{[0-9]+}}(%esp)
+; FALLBACK22-NEXT:    movaps %xmm1, {{[0-9]+}}(%esp)
+; FALLBACK22-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
+; FALLBACK22-NEXT:    leal (,%ecx,8), %edx
+; FALLBACK22-NEXT:    andl $24, %edx
+; FALLBACK22-NEXT:    andl $60, %ecx
+; FALLBACK22-NEXT:    movl 68(%esp,%ecx), %esi
+; FALLBACK22-NEXT:    movl 72(%esp,%ecx), %eax
+; FALLBACK22-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK22-NEXT:    shrxl %edx, %esi, %edi
+; FALLBACK22-NEXT:    movl %edx, %ebx
+; FALLBACK22-NEXT:    notb %bl
+; FALLBACK22-NEXT:    leal (%eax,%eax), %ebp
+; FALLBACK22-NEXT:    shlxl %ebx, %ebp, %ebp
+; FALLBACK22-NEXT:    orl %edi, %ebp
+; FALLBACK22-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK22-NEXT:    shrxl %edx, 64(%esp,%ecx), %edi
+; FALLBACK22-NEXT:    addl %esi, %esi
+; FALLBACK22-NEXT:    shlxl %ebx, %esi, %esi
+; FALLBACK22-NEXT:    orl %edi, %esi
+; FALLBACK22-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK22-NEXT:    movl 80(%esp,%ecx), %esi
+; FALLBACK22-NEXT:    leal (%esi,%esi), %edi
+; FALLBACK22-NEXT:    shlxl %ebx, %edi, %eax
+; FALLBACK22-NEXT:    movl 76(%esp,%ecx), %edi
+; FALLBACK22-NEXT:    shrxl %edx, %edi, %ebp
+; FALLBACK22-NEXT:    orl %ebp, %eax
+; FALLBACK22-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK22-NEXT:    shrxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; FALLBACK22-NEXT:    addl %edi, %edi
+; FALLBACK22-NEXT:    shlxl %ebx, %edi, %edi
+; FALLBACK22-NEXT:    orl %eax, %edi
+; FALLBACK22-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK22-NEXT:    movl 88(%esp,%ecx), %eax
+; FALLBACK22-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK22-NEXT:    leal (%eax,%eax), %edi
+; FALLBACK22-NEXT:    shlxl %ebx, %edi, %eax
+; FALLBACK22-NEXT:    movl 84(%esp,%ecx), %edi
+; FALLBACK22-NEXT:    shrxl %edx, %edi, %ebp
+; FALLBACK22-NEXT:    orl %ebp, %eax
+; FALLBACK22-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK22-NEXT:    shrxl %edx, %esi, %esi
+; FALLBACK22-NEXT:    addl %edi, %edi
+; FALLBACK22-NEXT:    shlxl %ebx, %edi, %eax
+; FALLBACK22-NEXT:    orl %esi, %eax
+; FALLBACK22-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK22-NEXT:    movl 96(%esp,%ecx), %esi
+; FALLBACK22-NEXT:    leal (%esi,%esi), %edi
+; FALLBACK22-NEXT:    shlxl %ebx, %edi, %eax
+; FALLBACK22-NEXT:    movl 92(%esp,%ecx), %edi
+; FALLBACK22-NEXT:    shrxl %edx, %edi, %ebp
+; FALLBACK22-NEXT:    orl %ebp, %eax
+; FALLBACK22-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK22-NEXT:    shrxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; FALLBACK22-NEXT:    addl %edi, %edi
+; FALLBACK22-NEXT:    shlxl %ebx, %edi, %edi
+; FALLBACK22-NEXT:    orl %eax, %edi
+; FALLBACK22-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK22-NEXT:    movl 104(%esp,%ecx), %eax
+; FALLBACK22-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK22-NEXT:    leal (%eax,%eax), %edi
+; FALLBACK22-NEXT:    shlxl %ebx, %edi, %eax
+; FALLBACK22-NEXT:    movl 100(%esp,%ecx), %edi
+; FALLBACK22-NEXT:    shrxl %edx, %edi, %ebp
+; FALLBACK22-NEXT:    orl %ebp, %eax
+; FALLBACK22-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK22-NEXT:    shrxl %edx, %esi, %esi
+; FALLBACK22-NEXT:    addl %edi, %edi
+; FALLBACK22-NEXT:    shlxl %ebx, %edi, %eax
+; FALLBACK22-NEXT:    orl %esi, %eax
+; FALLBACK22-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK22-NEXT:    movl %ecx, %eax
+; FALLBACK22-NEXT:    movl 112(%esp,%ecx), %ecx
+; FALLBACK22-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK22-NEXT:    leal (%ecx,%ecx), %esi
+; FALLBACK22-NEXT:    shlxl %ebx, %esi, %ecx
+; FALLBACK22-NEXT:    movl 108(%esp,%eax), %esi
+; FALLBACK22-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK22-NEXT:    shrxl %edx, %esi, %ebp
+; FALLBACK22-NEXT:    orl %ebp, %ecx
+; FALLBACK22-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK22-NEXT:    shrxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
+; FALLBACK22-NEXT:    addl %esi, %esi
+; FALLBACK22-NEXT:    shlxl %ebx, %esi, %esi
+; FALLBACK22-NEXT:    orl %ecx, %esi
+; FALLBACK22-NEXT:    movl 120(%esp,%eax), %ebp
+; FALLBACK22-NEXT:    leal (%ebp,%ebp), %ecx
+; FALLBACK22-NEXT:    shlxl %ebx, %ecx, %ecx
+; FALLBACK22-NEXT:    movl 116(%esp,%eax), %eax
+; FALLBACK22-NEXT:    shrxl %edx, %eax, %edi
+; FALLBACK22-NEXT:    orl %edi, %ecx
+; FALLBACK22-NEXT:    shrxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
+; FALLBACK22-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK22-NEXT:    addl %eax, %eax
+; FALLBACK22-NEXT:    shlxl %ebx, %eax, %edi
+; FALLBACK22-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
+; FALLBACK22-NEXT:    shrxl %edx, %ebp, %eax
+; FALLBACK22-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
+; FALLBACK22-NEXT:    movl 124(%esp,%ebp), %ebp
+; FALLBACK22-NEXT:    shrxl %edx, %ebp, %edx
+; FALLBACK22-NEXT:    addl %ebp, %ebp
+; FALLBACK22-NEXT:    shlxl %ebx, %ebp, %ebx
+; FALLBACK22-NEXT:    orl %eax, %ebx
+; FALLBACK22-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; FALLBACK22-NEXT:    movl %edx, 60(%eax)
+; FALLBACK22-NEXT:    movl %ebx, 56(%eax)
+; FALLBACK22-NEXT:    movl %edi, 48(%eax)
+; FALLBACK22-NEXT:    movl %ecx, 52(%eax)
+; FALLBACK22-NEXT:    movl %esi, 40(%eax)
+; FALLBACK22-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK22-NEXT:    movl %ecx, 44(%eax)
+; FALLBACK22-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK22-NEXT:    movl %ecx, 32(%eax)
+; FALLBACK22-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK22-NEXT:    movl %ecx, 36(%eax)
+; FALLBACK22-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK22-NEXT:    movl %ecx, 24(%eax)
+; FALLBACK22-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK22-NEXT:    movl %ecx, 28(%eax)
+; FALLBACK22-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK22-NEXT:    movl %ecx, 16(%eax)
+; FALLBACK22-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK22-NEXT:    movl %ecx, 20(%eax)
+; FALLBACK22-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK22-NEXT:    movl %ecx, 8(%eax)
+; FALLBACK22-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK22-NEXT:    movl %ecx, 12(%eax)
+; FALLBACK22-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK22-NEXT:    movl %ecx, (%eax)
+; FALLBACK22-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK22-NEXT:    movl %ecx, 4(%eax)
+; FALLBACK22-NEXT:    addl $204, %esp
+; FALLBACK22-NEXT:    popl %esi
+; FALLBACK22-NEXT:    popl %edi
+; FALLBACK22-NEXT:    popl %ebx
+; FALLBACK22-NEXT:    popl %ebp
+; FALLBACK22-NEXT:    retl
+;
+; FALLBACK23-LABEL: lshr_64bytes:
+; FALLBACK23:       # %bb.0:
+; FALLBACK23-NEXT:    pushl %ebp
+; FALLBACK23-NEXT:    pushl %ebx
+; FALLBACK23-NEXT:    pushl %edi
+; FALLBACK23-NEXT:    pushl %esi
+; FALLBACK23-NEXT:    subl $188, %esp
+; FALLBACK23-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; FALLBACK23-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; FALLBACK23-NEXT:    movups (%ecx), %xmm0
+; FALLBACK23-NEXT:    movups 16(%ecx), %xmm1
+; FALLBACK23-NEXT:    movups 32(%ecx), %xmm2
+; FALLBACK23-NEXT:    movups 48(%ecx), %xmm3
+; FALLBACK23-NEXT:    movl (%eax), %ecx
+; FALLBACK23-NEXT:    xorps %xmm4, %xmm4
+; FALLBACK23-NEXT:    movaps %xmm4, {{[0-9]+}}(%esp)
+; FALLBACK23-NEXT:    movaps %xmm4, {{[0-9]+}}(%esp)
+; FALLBACK23-NEXT:    movaps %xmm4, {{[0-9]+}}(%esp)
+; FALLBACK23-NEXT:    movaps %xmm4, {{[0-9]+}}(%esp)
+; FALLBACK23-NEXT:    movaps %xmm3, {{[0-9]+}}(%esp)
+; FALLBACK23-NEXT:    movaps %xmm2, {{[0-9]+}}(%esp)
+; FALLBACK23-NEXT:    movaps %xmm1, {{[0-9]+}}(%esp)
+; FALLBACK23-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
+; FALLBACK23-NEXT:    movl %ecx, %ebp
+; FALLBACK23-NEXT:    andl $60, %ebp
+; FALLBACK23-NEXT:    movl 56(%esp,%ebp), %edx
+; FALLBACK23-NEXT:    movl 52(%esp,%ebp), %eax
+; FALLBACK23-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK23-NEXT:    shll $3, %ecx
+; FALLBACK23-NEXT:    andl $24, %ecx
+; FALLBACK23-NEXT:    shrdl %cl, %edx, %eax
+; FALLBACK23-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK23-NEXT:    movl 64(%esp,%ebp), %edi
+; FALLBACK23-NEXT:    movl 60(%esp,%ebp), %eax
+; FALLBACK23-NEXT:    movl %eax, %esi
+; FALLBACK23-NEXT:    shrdl %cl, %edi, %esi
+; FALLBACK23-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK23-NEXT:    shrdl %cl, %eax, %edx
+; FALLBACK23-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK23-NEXT:    movl 72(%esp,%ebp), %esi
+; FALLBACK23-NEXT:    movl 68(%esp,%ebp), %eax
+; FALLBACK23-NEXT:    movl %eax, %edx
+; FALLBACK23-NEXT:    shrdl %cl, %esi, %edx
+; FALLBACK23-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK23-NEXT:    shrdl %cl, %eax, %edi
+; FALLBACK23-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK23-NEXT:    movl 80(%esp,%ebp), %edi
+; FALLBACK23-NEXT:    movl 76(%esp,%ebp), %eax
+; FALLBACK23-NEXT:    movl %eax, %edx
+; FALLBACK23-NEXT:    shrdl %cl, %edi, %edx
+; FALLBACK23-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK23-NEXT:    shrdl %cl, %eax, %esi
+; FALLBACK23-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK23-NEXT:    movl 88(%esp,%ebp), %ebx
+; FALLBACK23-NEXT:    movl 84(%esp,%ebp), %eax
+; FALLBACK23-NEXT:    movl %eax, %edx
+; FALLBACK23-NEXT:    shrdl %cl, %ebx, %edx
+; FALLBACK23-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK23-NEXT:    shrdl %cl, %eax, %edi
+; FALLBACK23-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK23-NEXT:    movl 96(%esp,%ebp), %esi
+; FALLBACK23-NEXT:    movl 92(%esp,%ebp), %eax
+; FALLBACK23-NEXT:    movl %eax, %edx
+; FALLBACK23-NEXT:    shrdl %cl, %esi, %edx
+; FALLBACK23-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK23-NEXT:    shrdl %cl, %eax, %ebx
+; FALLBACK23-NEXT:    movl 104(%esp,%ebp), %eax
+; FALLBACK23-NEXT:    movl 100(%esp,%ebp), %edi
+; FALLBACK23-NEXT:    movl %edi, %edx
+; FALLBACK23-NEXT:    shrdl %cl, %eax, %edx
+; FALLBACK23-NEXT:    shrdl %cl, %edi, %esi
+; FALLBACK23-NEXT:    movl 48(%esp,%ebp), %edi
+; FALLBACK23-NEXT:    movl 108(%esp,%ebp), %ebp
+; FALLBACK23-NEXT:    movl %ebp, (%esp) # 4-byte Spill
+; FALLBACK23-NEXT:    shrdl %cl, %ebp, %eax
+; FALLBACK23-NEXT:    movl {{[0-9]+}}(%esp), %ebp
+; FALLBACK23-NEXT:    movl %eax, 56(%ebp)
+; FALLBACK23-NEXT:    movl %esi, 48(%ebp)
+; FALLBACK23-NEXT:    movl %edx, 52(%ebp)
+; FALLBACK23-NEXT:    movl %ebx, 40(%ebp)
+; FALLBACK23-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK23-NEXT:    movl %eax, 44(%ebp)
+; FALLBACK23-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK23-NEXT:    movl %eax, 32(%ebp)
+; FALLBACK23-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK23-NEXT:    movl %eax, 36(%ebp)
+; FALLBACK23-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK23-NEXT:    movl %eax, 24(%ebp)
+; FALLBACK23-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK23-NEXT:    movl %eax, 28(%ebp)
+; FALLBACK23-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK23-NEXT:    movl %eax, 16(%ebp)
+; FALLBACK23-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK23-NEXT:    movl %eax, 20(%ebp)
+; FALLBACK23-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK23-NEXT:    movl %eax, 8(%ebp)
+; FALLBACK23-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK23-NEXT:    movl %eax, 12(%ebp)
+; FALLBACK23-NEXT:    shrxl %ecx, (%esp), %eax # 4-byte Folded Reload
+; FALLBACK23-NEXT:    # kill: def $cl killed $cl killed $ecx
+; FALLBACK23-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; FALLBACK23-NEXT:    shrdl %cl, %edx, %edi
+; FALLBACK23-NEXT:    movl %edi, (%ebp)
+; FALLBACK23-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK23-NEXT:    movl %ecx, 4(%ebp)
+; FALLBACK23-NEXT:    movl %eax, 60(%ebp)
+; FALLBACK23-NEXT:    addl $188, %esp
+; FALLBACK23-NEXT:    popl %esi
+; FALLBACK23-NEXT:    popl %edi
+; FALLBACK23-NEXT:    popl %ebx
+; FALLBACK23-NEXT:    popl %ebp
+; FALLBACK23-NEXT:    retl
+;
+; FALLBACK24-LABEL: lshr_64bytes:
+; FALLBACK24:       # %bb.0:
+; FALLBACK24-NEXT:    pushl %ebp
+; FALLBACK24-NEXT:    pushl %ebx
+; FALLBACK24-NEXT:    pushl %edi
+; FALLBACK24-NEXT:    pushl %esi
+; FALLBACK24-NEXT:    subl $204, %esp
+; FALLBACK24-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; FALLBACK24-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; FALLBACK24-NEXT:    vmovups (%ecx), %ymm0
+; FALLBACK24-NEXT:    vmovups 32(%ecx), %ymm1
+; FALLBACK24-NEXT:    movl (%eax), %ecx
+; FALLBACK24-NEXT:    vxorps %xmm2, %xmm2, %xmm2
+; FALLBACK24-NEXT:    vmovups %ymm2, {{[0-9]+}}(%esp)
+; FALLBACK24-NEXT:    vmovups %ymm2, {{[0-9]+}}(%esp)
+; FALLBACK24-NEXT:    vmovups %ymm1, {{[0-9]+}}(%esp)
+; FALLBACK24-NEXT:    vmovups %ymm0, {{[0-9]+}}(%esp)
+; FALLBACK24-NEXT:    movl %ecx, %esi
+; FALLBACK24-NEXT:    andl $60, %esi
+; FALLBACK24-NEXT:    movl 68(%esp,%esi), %edx
+; FALLBACK24-NEXT:    shll $3, %ecx
+; FALLBACK24-NEXT:    andl $24, %ecx
+; FALLBACK24-NEXT:    movl %edx, %edi
+; FALLBACK24-NEXT:    shrl %cl, %edi
+; FALLBACK24-NEXT:    movl 72(%esp,%esi), %eax
+; FALLBACK24-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK24-NEXT:    leal (%eax,%eax), %ebx
+; FALLBACK24-NEXT:    movl %ecx, %ebp
+; FALLBACK24-NEXT:    movb %cl, %ch
+; FALLBACK24-NEXT:    notb %ch
+; FALLBACK24-NEXT:    movb %ch, %cl
+; FALLBACK24-NEXT:    shll %cl, %ebx
+; FALLBACK24-NEXT:    orl %edi, %ebx
+; FALLBACK24-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK24-NEXT:    movl 64(%esp,%esi), %edi
+; FALLBACK24-NEXT:    movl %ebp, %eax
+; FALLBACK24-NEXT:    movb %al, %cl
+; FALLBACK24-NEXT:    shrl %cl, %edi
+; FALLBACK24-NEXT:    addl %edx, %edx
+; FALLBACK24-NEXT:    movb %ch, %cl
+; FALLBACK24-NEXT:    shll %cl, %edx
+; FALLBACK24-NEXT:    orl %edi, %edx
+; FALLBACK24-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK24-NEXT:    movl 76(%esp,%esi), %edx
+; FALLBACK24-NEXT:    movl %edx, %ebp
+; FALLBACK24-NEXT:    movb %al, %cl
+; FALLBACK24-NEXT:    shrl %cl, %ebp
+; FALLBACK24-NEXT:    movl 80(%esp,%esi), %edi
+; FALLBACK24-NEXT:    leal (%edi,%edi), %ebx
+; FALLBACK24-NEXT:    movb %ch, %cl
+; FALLBACK24-NEXT:    shll %cl, %ebx
+; FALLBACK24-NEXT:    orl %ebp, %ebx
+; FALLBACK24-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK24-NEXT:    movb %al, %cl
+; FALLBACK24-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; FALLBACK24-NEXT:    shrl %cl, %ebx
+; FALLBACK24-NEXT:    addl %edx, %edx
+; FALLBACK24-NEXT:    movb %ch, %cl
+; FALLBACK24-NEXT:    shll %cl, %edx
+; FALLBACK24-NEXT:    orl %ebx, %edx
+; FALLBACK24-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK24-NEXT:    movl 84(%esp,%esi), %ebx
+; FALLBACK24-NEXT:    movl %ebx, %ebp
+; FALLBACK24-NEXT:    movl %eax, %edx
+; FALLBACK24-NEXT:    movb %dl, %cl
+; FALLBACK24-NEXT:    shrl %cl, %ebp
+; FALLBACK24-NEXT:    movl 88(%esp,%esi), %eax
+; FALLBACK24-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK24-NEXT:    addl %eax, %eax
+; FALLBACK24-NEXT:    movb %ch, %cl
+; FALLBACK24-NEXT:    shll %cl, %eax
+; FALLBACK24-NEXT:    orl %ebp, %eax
+; FALLBACK24-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK24-NEXT:    movb %dl, %cl
+; FALLBACK24-NEXT:    shrl %cl, %edi
+; FALLBACK24-NEXT:    addl %ebx, %ebx
+; FALLBACK24-NEXT:    movb %ch, %cl
+; FALLBACK24-NEXT:    movb %ch, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill
+; FALLBACK24-NEXT:    shll %cl, %ebx
+; FALLBACK24-NEXT:    orl %edi, %ebx
+; FALLBACK24-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK24-NEXT:    movl 92(%esp,%esi), %ebx
+; FALLBACK24-NEXT:    movl %ebx, %ebp
+; FALLBACK24-NEXT:    movb %dl, %cl
+; FALLBACK24-NEXT:    shrl %cl, %ebp
+; FALLBACK24-NEXT:    movl 96(%esp,%esi), %edi
+; FALLBACK24-NEXT:    leal (%edi,%edi), %eax
+; FALLBACK24-NEXT:    movb %ch, %cl
+; FALLBACK24-NEXT:    shll %cl, %eax
+; FALLBACK24-NEXT:    orl %ebp, %eax
+; FALLBACK24-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK24-NEXT:    movb %dl, %cl
+; FALLBACK24-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK24-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK24-NEXT:    shrl %cl, %eax
+; FALLBACK24-NEXT:    addl %ebx, %ebx
+; FALLBACK24-NEXT:    movb %ch, %cl
+; FALLBACK24-NEXT:    shll %cl, %ebx
+; FALLBACK24-NEXT:    orl %eax, %ebx
+; FALLBACK24-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK24-NEXT:    movl 100(%esp,%esi), %ebx
+; FALLBACK24-NEXT:    movl %ebx, %ebp
+; FALLBACK24-NEXT:    movb %dl, %cl
+; FALLBACK24-NEXT:    shrl %cl, %ebp
+; FALLBACK24-NEXT:    movl 104(%esp,%esi), %edx
+; FALLBACK24-NEXT:    leal (%edx,%edx), %eax
+; FALLBACK24-NEXT:    movb %ch, %cl
+; FALLBACK24-NEXT:    shll %cl, %eax
+; FALLBACK24-NEXT:    orl %ebp, %eax
+; FALLBACK24-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK24-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK24-NEXT:    movb %al, %cl
+; FALLBACK24-NEXT:    shrl %cl, %edi
+; FALLBACK24-NEXT:    addl %ebx, %ebx
+; FALLBACK24-NEXT:    movb %ch, %cl
+; FALLBACK24-NEXT:    shll %cl, %ebx
+; FALLBACK24-NEXT:    orl %edi, %ebx
+; FALLBACK24-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK24-NEXT:    movl 108(%esp,%esi), %edi
+; FALLBACK24-NEXT:    movl %edi, %ebp
+; FALLBACK24-NEXT:    movl %eax, %ecx
+; FALLBACK24-NEXT:    shrl %cl, %ebp
+; FALLBACK24-NEXT:    movl 112(%esp,%esi), %ecx
+; FALLBACK24-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK24-NEXT:    leal (%ecx,%ecx), %ebx
+; FALLBACK24-NEXT:    movb {{[-0-9]+}}(%e{{[sb]}}p), %ch # 1-byte Reload
+; FALLBACK24-NEXT:    movb %ch, %cl
+; FALLBACK24-NEXT:    shll %cl, %ebx
+; FALLBACK24-NEXT:    orl %ebp, %ebx
+; FALLBACK24-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK24-NEXT:    movb %al, %cl
+; FALLBACK24-NEXT:    shrl %cl, %edx
+; FALLBACK24-NEXT:    addl %edi, %edi
+; FALLBACK24-NEXT:    movb %ch, %cl
+; FALLBACK24-NEXT:    shll %cl, %edi
+; FALLBACK24-NEXT:    orl %edx, %edi
+; FALLBACK24-NEXT:    movl %esi, %edx
+; FALLBACK24-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK24-NEXT:    movl 116(%esp,%esi), %esi
+; FALLBACK24-NEXT:    movl %esi, %ebx
+; FALLBACK24-NEXT:    movb %al, %cl
+; FALLBACK24-NEXT:    shrl %cl, %ebx
+; FALLBACK24-NEXT:    movl 120(%esp,%edx), %eax
+; FALLBACK24-NEXT:    leal (%eax,%eax), %ebp
+; FALLBACK24-NEXT:    movb %ch, %cl
+; FALLBACK24-NEXT:    shll %cl, %ebp
+; FALLBACK24-NEXT:    orl %ebx, %ebp
+; FALLBACK24-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; FALLBACK24-NEXT:    movb %dl, %cl
+; FALLBACK24-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; FALLBACK24-NEXT:    shrl %cl, %ebx
+; FALLBACK24-NEXT:    addl %esi, %esi
+; FALLBACK24-NEXT:    movb %ch, %cl
+; FALLBACK24-NEXT:    shll %cl, %esi
+; FALLBACK24-NEXT:    orl %ebx, %esi
+; FALLBACK24-NEXT:    movb %dl, %cl
+; FALLBACK24-NEXT:    shrl %cl, %eax
+; FALLBACK24-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; FALLBACK24-NEXT:    movl 124(%esp,%edx), %ebx
+; FALLBACK24-NEXT:    leal (%ebx,%ebx), %edx
+; FALLBACK24-NEXT:    movb %ch, %cl
+; FALLBACK24-NEXT:    shll %cl, %edx
+; FALLBACK24-NEXT:    orl %eax, %edx
+; FALLBACK24-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK24-NEXT:    # kill: def $cl killed $cl killed $ecx
+; FALLBACK24-NEXT:    shrl %cl, %ebx
+; FALLBACK24-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; FALLBACK24-NEXT:    movl %ebx, 60(%eax)
+; FALLBACK24-NEXT:    movl %edx, 56(%eax)
+; FALLBACK24-NEXT:    movl %esi, 48(%eax)
+; FALLBACK24-NEXT:    movl %ebp, 52(%eax)
+; FALLBACK24-NEXT:    movl %edi, 40(%eax)
+; FALLBACK24-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK24-NEXT:    movl %ecx, 44(%eax)
+; FALLBACK24-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK24-NEXT:    movl %ecx, 32(%eax)
+; FALLBACK24-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK24-NEXT:    movl %ecx, 36(%eax)
+; FALLBACK24-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK24-NEXT:    movl %ecx, 24(%eax)
+; FALLBACK24-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK24-NEXT:    movl %ecx, 28(%eax)
+; FALLBACK24-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK24-NEXT:    movl %ecx, 16(%eax)
+; FALLBACK24-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK24-NEXT:    movl %ecx, 20(%eax)
+; FALLBACK24-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK24-NEXT:    movl %ecx, 8(%eax)
+; FALLBACK24-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK24-NEXT:    movl %ecx, 12(%eax)
+; FALLBACK24-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK24-NEXT:    movl %ecx, (%eax)
+; FALLBACK24-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK24-NEXT:    movl %ecx, 4(%eax)
+; FALLBACK24-NEXT:    addl $204, %esp
+; FALLBACK24-NEXT:    popl %esi
+; FALLBACK24-NEXT:    popl %edi
+; FALLBACK24-NEXT:    popl %ebx
+; FALLBACK24-NEXT:    popl %ebp
+; FALLBACK24-NEXT:    vzeroupper
+; FALLBACK24-NEXT:    retl
+;
+; FALLBACK25-LABEL: lshr_64bytes:
+; FALLBACK25:       # %bb.0:
+; FALLBACK25-NEXT:    pushl %ebp
+; FALLBACK25-NEXT:    pushl %ebx
+; FALLBACK25-NEXT:    pushl %edi
+; FALLBACK25-NEXT:    pushl %esi
+; FALLBACK25-NEXT:    subl $188, %esp
+; FALLBACK25-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; FALLBACK25-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; FALLBACK25-NEXT:    vmovups (%ecx), %ymm0
+; FALLBACK25-NEXT:    vmovups 32(%ecx), %ymm1
+; FALLBACK25-NEXT:    movl (%eax), %ecx
+; FALLBACK25-NEXT:    vxorps %xmm2, %xmm2, %xmm2
+; FALLBACK25-NEXT:    vmovups %ymm2, {{[0-9]+}}(%esp)
+; FALLBACK25-NEXT:    vmovups %ymm2, {{[0-9]+}}(%esp)
+; FALLBACK25-NEXT:    vmovups %ymm1, {{[0-9]+}}(%esp)
+; FALLBACK25-NEXT:    vmovups %ymm0, {{[0-9]+}}(%esp)
+; FALLBACK25-NEXT:    movl %ecx, %ebp
+; FALLBACK25-NEXT:    andl $60, %ebp
+; FALLBACK25-NEXT:    movl 56(%esp,%ebp), %edx
+; FALLBACK25-NEXT:    movl 52(%esp,%ebp), %eax
+; FALLBACK25-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK25-NEXT:    shll $3, %ecx
+; FALLBACK25-NEXT:    andl $24, %ecx
+; FALLBACK25-NEXT:    shrdl %cl, %edx, %eax
+; FALLBACK25-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK25-NEXT:    movl 64(%esp,%ebp), %edi
+; FALLBACK25-NEXT:    movl 60(%esp,%ebp), %eax
+; FALLBACK25-NEXT:    movl %eax, %esi
+; FALLBACK25-NEXT:    shrdl %cl, %edi, %esi
+; FALLBACK25-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK25-NEXT:    shrdl %cl, %eax, %edx
+; FALLBACK25-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK25-NEXT:    movl 72(%esp,%ebp), %esi
+; FALLBACK25-NEXT:    movl 68(%esp,%ebp), %eax
+; FALLBACK25-NEXT:    movl %eax, %edx
+; FALLBACK25-NEXT:    shrdl %cl, %esi, %edx
+; FALLBACK25-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK25-NEXT:    shrdl %cl, %eax, %edi
+; FALLBACK25-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK25-NEXT:    movl 80(%esp,%ebp), %edi
+; FALLBACK25-NEXT:    movl 76(%esp,%ebp), %eax
+; FALLBACK25-NEXT:    movl %eax, %edx
+; FALLBACK25-NEXT:    shrdl %cl, %edi, %edx
+; FALLBACK25-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK25-NEXT:    shrdl %cl, %eax, %esi
+; FALLBACK25-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK25-NEXT:    movl 88(%esp,%ebp), %esi
+; FALLBACK25-NEXT:    movl 84(%esp,%ebp), %eax
+; FALLBACK25-NEXT:    movl %eax, %edx
+; FALLBACK25-NEXT:    shrdl %cl, %esi, %edx
+; FALLBACK25-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK25-NEXT:    movl %esi, %edx
+; FALLBACK25-NEXT:    shrdl %cl, %eax, %edi
+; FALLBACK25-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK25-NEXT:    movl 96(%esp,%ebp), %esi
+; FALLBACK25-NEXT:    movl 92(%esp,%ebp), %eax
+; FALLBACK25-NEXT:    movl %eax, %edi
+; FALLBACK25-NEXT:    shrdl %cl, %esi, %edi
+; FALLBACK25-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK25-NEXT:    shrdl %cl, %eax, %edx
+; FALLBACK25-NEXT:    movl %edx, (%esp) # 4-byte Spill
+; FALLBACK25-NEXT:    movl 104(%esp,%ebp), %edx
+; FALLBACK25-NEXT:    movl 100(%esp,%ebp), %eax
+; FALLBACK25-NEXT:    movl %eax, %edi
+; FALLBACK25-NEXT:    shrdl %cl, %edx, %edi
+; FALLBACK25-NEXT:    shrdl %cl, %eax, %esi
+; FALLBACK25-NEXT:    movl 48(%esp,%ebp), %ebx
+; FALLBACK25-NEXT:    movl 108(%esp,%ebp), %eax
+; FALLBACK25-NEXT:    shrdl %cl, %eax, %edx
+; FALLBACK25-NEXT:    movl {{[0-9]+}}(%esp), %ebp
+; FALLBACK25-NEXT:    movl %edx, 56(%ebp)
+; FALLBACK25-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; FALLBACK25-NEXT:    shrdl %cl, %edx, %ebx
+; FALLBACK25-NEXT:    # kill: def $cl killed $cl killed $ecx
+; FALLBACK25-NEXT:    shrl %cl, %eax
+; FALLBACK25-NEXT:    movl %eax, 60(%ebp)
+; FALLBACK25-NEXT:    movl %esi, 48(%ebp)
+; FALLBACK25-NEXT:    movl %edi, 52(%ebp)
+; FALLBACK25-NEXT:    movl (%esp), %eax # 4-byte Reload
+; FALLBACK25-NEXT:    movl %eax, 40(%ebp)
+; FALLBACK25-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK25-NEXT:    movl %eax, 44(%ebp)
+; FALLBACK25-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK25-NEXT:    movl %eax, 32(%ebp)
+; FALLBACK25-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK25-NEXT:    movl %eax, 36(%ebp)
+; FALLBACK25-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK25-NEXT:    movl %eax, 24(%ebp)
+; FALLBACK25-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK25-NEXT:    movl %eax, 28(%ebp)
+; FALLBACK25-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK25-NEXT:    movl %eax, 16(%ebp)
+; FALLBACK25-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK25-NEXT:    movl %eax, 20(%ebp)
+; FALLBACK25-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK25-NEXT:    movl %eax, 8(%ebp)
+; FALLBACK25-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK25-NEXT:    movl %eax, 12(%ebp)
+; FALLBACK25-NEXT:    movl %ebx, (%ebp)
+; FALLBACK25-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK25-NEXT:    movl %eax, 4(%ebp)
+; FALLBACK25-NEXT:    addl $188, %esp
+; FALLBACK25-NEXT:    popl %esi
+; FALLBACK25-NEXT:    popl %edi
+; FALLBACK25-NEXT:    popl %ebx
+; FALLBACK25-NEXT:    popl %ebp
+; FALLBACK25-NEXT:    vzeroupper
+; FALLBACK25-NEXT:    retl
+;
+; FALLBACK26-LABEL: lshr_64bytes:
+; FALLBACK26:       # %bb.0:
+; FALLBACK26-NEXT:    pushl %ebp
+; FALLBACK26-NEXT:    pushl %ebx
+; FALLBACK26-NEXT:    pushl %edi
+; FALLBACK26-NEXT:    pushl %esi
+; FALLBACK26-NEXT:    subl $204, %esp
+; FALLBACK26-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; FALLBACK26-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; FALLBACK26-NEXT:    vmovups (%ecx), %ymm0
+; FALLBACK26-NEXT:    vmovups 32(%ecx), %ymm1
+; FALLBACK26-NEXT:    movl (%eax), %ecx
+; FALLBACK26-NEXT:    vxorps %xmm2, %xmm2, %xmm2
+; FALLBACK26-NEXT:    vmovups %ymm2, {{[0-9]+}}(%esp)
+; FALLBACK26-NEXT:    vmovups %ymm2, {{[0-9]+}}(%esp)
+; FALLBACK26-NEXT:    vmovups %ymm1, {{[0-9]+}}(%esp)
+; FALLBACK26-NEXT:    vmovups %ymm0, {{[0-9]+}}(%esp)
+; FALLBACK26-NEXT:    leal (,%ecx,8), %edx
+; FALLBACK26-NEXT:    andl $24, %edx
+; FALLBACK26-NEXT:    andl $60, %ecx
+; FALLBACK26-NEXT:    movl 68(%esp,%ecx), %esi
+; FALLBACK26-NEXT:    movl 72(%esp,%ecx), %eax
+; FALLBACK26-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK26-NEXT:    shrxl %edx, %esi, %edi
+; FALLBACK26-NEXT:    movl %edx, %ebx
+; FALLBACK26-NEXT:    notb %bl
+; FALLBACK26-NEXT:    leal (%eax,%eax), %ebp
+; FALLBACK26-NEXT:    shlxl %ebx, %ebp, %ebp
+; FALLBACK26-NEXT:    orl %edi, %ebp
+; FALLBACK26-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK26-NEXT:    shrxl %edx, 64(%esp,%ecx), %edi
+; FALLBACK26-NEXT:    addl %esi, %esi
+; FALLBACK26-NEXT:    shlxl %ebx, %esi, %esi
+; FALLBACK26-NEXT:    orl %edi, %esi
+; FALLBACK26-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK26-NEXT:    movl 80(%esp,%ecx), %esi
+; FALLBACK26-NEXT:    leal (%esi,%esi), %edi
+; FALLBACK26-NEXT:    shlxl %ebx, %edi, %eax
+; FALLBACK26-NEXT:    movl 76(%esp,%ecx), %edi
+; FALLBACK26-NEXT:    shrxl %edx, %edi, %ebp
+; FALLBACK26-NEXT:    orl %ebp, %eax
+; FALLBACK26-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK26-NEXT:    shrxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; FALLBACK26-NEXT:    addl %edi, %edi
+; FALLBACK26-NEXT:    shlxl %ebx, %edi, %edi
+; FALLBACK26-NEXT:    orl %eax, %edi
+; FALLBACK26-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK26-NEXT:    movl 88(%esp,%ecx), %eax
+; FALLBACK26-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK26-NEXT:    leal (%eax,%eax), %edi
+; FALLBACK26-NEXT:    shlxl %ebx, %edi, %eax
+; FALLBACK26-NEXT:    movl 84(%esp,%ecx), %edi
+; FALLBACK26-NEXT:    shrxl %edx, %edi, %ebp
+; FALLBACK26-NEXT:    orl %ebp, %eax
+; FALLBACK26-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK26-NEXT:    shrxl %edx, %esi, %esi
+; FALLBACK26-NEXT:    addl %edi, %edi
+; FALLBACK26-NEXT:    shlxl %ebx, %edi, %eax
+; FALLBACK26-NEXT:    orl %esi, %eax
+; FALLBACK26-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK26-NEXT:    movl 96(%esp,%ecx), %esi
+; FALLBACK26-NEXT:    leal (%esi,%esi), %edi
+; FALLBACK26-NEXT:    shlxl %ebx, %edi, %eax
+; FALLBACK26-NEXT:    movl 92(%esp,%ecx), %edi
+; FALLBACK26-NEXT:    shrxl %edx, %edi, %ebp
+; FALLBACK26-NEXT:    orl %ebp, %eax
+; FALLBACK26-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK26-NEXT:    shrxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; FALLBACK26-NEXT:    addl %edi, %edi
+; FALLBACK26-NEXT:    shlxl %ebx, %edi, %edi
+; FALLBACK26-NEXT:    orl %eax, %edi
+; FALLBACK26-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK26-NEXT:    movl 104(%esp,%ecx), %eax
+; FALLBACK26-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK26-NEXT:    leal (%eax,%eax), %edi
+; FALLBACK26-NEXT:    shlxl %ebx, %edi, %eax
+; FALLBACK26-NEXT:    movl 100(%esp,%ecx), %edi
+; FALLBACK26-NEXT:    shrxl %edx, %edi, %ebp
+; FALLBACK26-NEXT:    orl %ebp, %eax
+; FALLBACK26-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK26-NEXT:    shrxl %edx, %esi, %esi
+; FALLBACK26-NEXT:    addl %edi, %edi
+; FALLBACK26-NEXT:    shlxl %ebx, %edi, %eax
+; FALLBACK26-NEXT:    orl %esi, %eax
+; FALLBACK26-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK26-NEXT:    movl 112(%esp,%ecx), %eax
+; FALLBACK26-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK26-NEXT:    leal (%eax,%eax), %esi
+; FALLBACK26-NEXT:    shlxl %ebx, %esi, %eax
+; FALLBACK26-NEXT:    movl 108(%esp,%ecx), %esi
+; FALLBACK26-NEXT:    shrxl %edx, %esi, %ebp
+; FALLBACK26-NEXT:    orl %ebp, %eax
+; FALLBACK26-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK26-NEXT:    shrxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; FALLBACK26-NEXT:    addl %esi, %esi
+; FALLBACK26-NEXT:    shlxl %ebx, %esi, %esi
+; FALLBACK26-NEXT:    orl %eax, %esi
+; FALLBACK26-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK26-NEXT:    movl 120(%esp,%ecx), %ebp
+; FALLBACK26-NEXT:    leal (%ebp,%ebp), %eax
+; FALLBACK26-NEXT:    shlxl %ebx, %eax, %esi
+; FALLBACK26-NEXT:    movl 116(%esp,%ecx), %eax
+; FALLBACK26-NEXT:    shrxl %edx, %eax, %edi
+; FALLBACK26-NEXT:    orl %edi, %esi
+; FALLBACK26-NEXT:    shrxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
+; FALLBACK26-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK26-NEXT:    addl %eax, %eax
+; FALLBACK26-NEXT:    shlxl %ebx, %eax, %edi
+; FALLBACK26-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
+; FALLBACK26-NEXT:    shrxl %edx, %ebp, %eax
+; FALLBACK26-NEXT:    movl 124(%esp,%ecx), %ecx
+; FALLBACK26-NEXT:    shrxl %edx, %ecx, %edx
+; FALLBACK26-NEXT:    addl %ecx, %ecx
+; FALLBACK26-NEXT:    shlxl %ebx, %ecx, %ebx
+; FALLBACK26-NEXT:    orl %eax, %ebx
+; FALLBACK26-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; FALLBACK26-NEXT:    movl %edx, 60(%ecx)
+; FALLBACK26-NEXT:    movl %ebx, 56(%ecx)
+; FALLBACK26-NEXT:    movl %edi, 48(%ecx)
+; FALLBACK26-NEXT:    movl %esi, 52(%ecx)
+; FALLBACK26-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK26-NEXT:    movl %eax, 40(%ecx)
+; FALLBACK26-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK26-NEXT:    movl %eax, 44(%ecx)
+; FALLBACK26-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK26-NEXT:    movl %eax, 32(%ecx)
+; FALLBACK26-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK26-NEXT:    movl %eax, 36(%ecx)
+; FALLBACK26-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK26-NEXT:    movl %eax, 24(%ecx)
+; FALLBACK26-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK26-NEXT:    movl %eax, 28(%ecx)
+; FALLBACK26-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK26-NEXT:    movl %eax, 16(%ecx)
+; FALLBACK26-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK26-NEXT:    movl %eax, 20(%ecx)
+; FALLBACK26-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK26-NEXT:    movl %eax, 8(%ecx)
+; FALLBACK26-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK26-NEXT:    movl %eax, 12(%ecx)
+; FALLBACK26-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK26-NEXT:    movl %eax, (%ecx)
+; FALLBACK26-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK26-NEXT:    movl %eax, 4(%ecx)
+; FALLBACK26-NEXT:    addl $204, %esp
+; FALLBACK26-NEXT:    popl %esi
+; FALLBACK26-NEXT:    popl %edi
+; FALLBACK26-NEXT:    popl %ebx
+; FALLBACK26-NEXT:    popl %ebp
+; FALLBACK26-NEXT:    vzeroupper
+; FALLBACK26-NEXT:    retl
+;
+; FALLBACK27-LABEL: lshr_64bytes:
+; FALLBACK27:       # %bb.0:
+; FALLBACK27-NEXT:    pushl %ebp
+; FALLBACK27-NEXT:    pushl %ebx
+; FALLBACK27-NEXT:    pushl %edi
+; FALLBACK27-NEXT:    pushl %esi
+; FALLBACK27-NEXT:    subl $188, %esp
+; FALLBACK27-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; FALLBACK27-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; FALLBACK27-NEXT:    vmovups (%ecx), %ymm0
+; FALLBACK27-NEXT:    vmovups 32(%ecx), %ymm1
+; FALLBACK27-NEXT:    movl (%eax), %ecx
+; FALLBACK27-NEXT:    vxorps %xmm2, %xmm2, %xmm2
+; FALLBACK27-NEXT:    vmovups %ymm2, {{[0-9]+}}(%esp)
+; FALLBACK27-NEXT:    vmovups %ymm2, {{[0-9]+}}(%esp)
+; FALLBACK27-NEXT:    vmovups %ymm1, {{[0-9]+}}(%esp)
+; FALLBACK27-NEXT:    vmovups %ymm0, {{[0-9]+}}(%esp)
+; FALLBACK27-NEXT:    movl %ecx, %ebp
+; FALLBACK27-NEXT:    andl $60, %ebp
+; FALLBACK27-NEXT:    movl 56(%esp,%ebp), %edx
+; FALLBACK27-NEXT:    movl 52(%esp,%ebp), %eax
+; FALLBACK27-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK27-NEXT:    shll $3, %ecx
+; FALLBACK27-NEXT:    andl $24, %ecx
+; FALLBACK27-NEXT:    shrdl %cl, %edx, %eax
+; FALLBACK27-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK27-NEXT:    movl 64(%esp,%ebp), %edi
+; FALLBACK27-NEXT:    movl 60(%esp,%ebp), %eax
+; FALLBACK27-NEXT:    movl %eax, %esi
+; FALLBACK27-NEXT:    shrdl %cl, %edi, %esi
+; FALLBACK27-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK27-NEXT:    shrdl %cl, %eax, %edx
+; FALLBACK27-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK27-NEXT:    movl 72(%esp,%ebp), %esi
+; FALLBACK27-NEXT:    movl 68(%esp,%ebp), %eax
+; FALLBACK27-NEXT:    movl %eax, %edx
+; FALLBACK27-NEXT:    shrdl %cl, %esi, %edx
+; FALLBACK27-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK27-NEXT:    shrdl %cl, %eax, %edi
+; FALLBACK27-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK27-NEXT:    movl 80(%esp,%ebp), %edi
+; FALLBACK27-NEXT:    movl 76(%esp,%ebp), %eax
+; FALLBACK27-NEXT:    movl %eax, %edx
+; FALLBACK27-NEXT:    shrdl %cl, %edi, %edx
+; FALLBACK27-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK27-NEXT:    shrdl %cl, %eax, %esi
+; FALLBACK27-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK27-NEXT:    movl 88(%esp,%ebp), %ebx
+; FALLBACK27-NEXT:    movl 84(%esp,%ebp), %eax
+; FALLBACK27-NEXT:    movl %eax, %edx
+; FALLBACK27-NEXT:    shrdl %cl, %ebx, %edx
+; FALLBACK27-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK27-NEXT:    shrdl %cl, %eax, %edi
+; FALLBACK27-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK27-NEXT:    movl 96(%esp,%ebp), %esi
+; FALLBACK27-NEXT:    movl 92(%esp,%ebp), %eax
+; FALLBACK27-NEXT:    movl %eax, %edx
+; FALLBACK27-NEXT:    shrdl %cl, %esi, %edx
+; FALLBACK27-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK27-NEXT:    shrdl %cl, %eax, %ebx
+; FALLBACK27-NEXT:    movl 104(%esp,%ebp), %eax
+; FALLBACK27-NEXT:    movl 100(%esp,%ebp), %edi
+; FALLBACK27-NEXT:    movl %edi, %edx
+; FALLBACK27-NEXT:    shrdl %cl, %eax, %edx
+; FALLBACK27-NEXT:    shrdl %cl, %edi, %esi
+; FALLBACK27-NEXT:    movl 48(%esp,%ebp), %edi
+; FALLBACK27-NEXT:    movl 108(%esp,%ebp), %ebp
+; FALLBACK27-NEXT:    movl %ebp, (%esp) # 4-byte Spill
+; FALLBACK27-NEXT:    shrdl %cl, %ebp, %eax
+; FALLBACK27-NEXT:    movl {{[0-9]+}}(%esp), %ebp
+; FALLBACK27-NEXT:    movl %eax, 56(%ebp)
+; FALLBACK27-NEXT:    movl %esi, 48(%ebp)
+; FALLBACK27-NEXT:    movl %edx, 52(%ebp)
+; FALLBACK27-NEXT:    movl %ebx, 40(%ebp)
+; FALLBACK27-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK27-NEXT:    movl %eax, 44(%ebp)
+; FALLBACK27-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK27-NEXT:    movl %eax, 32(%ebp)
+; FALLBACK27-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK27-NEXT:    movl %eax, 36(%ebp)
+; FALLBACK27-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK27-NEXT:    movl %eax, 24(%ebp)
+; FALLBACK27-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK27-NEXT:    movl %eax, 28(%ebp)
+; FALLBACK27-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK27-NEXT:    movl %eax, 16(%ebp)
+; FALLBACK27-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK27-NEXT:    movl %eax, 20(%ebp)
+; FALLBACK27-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK27-NEXT:    movl %eax, 8(%ebp)
+; FALLBACK27-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK27-NEXT:    movl %eax, 12(%ebp)
+; FALLBACK27-NEXT:    shrxl %ecx, (%esp), %eax # 4-byte Folded Reload
+; FALLBACK27-NEXT:    # kill: def $cl killed $cl killed $ecx
+; FALLBACK27-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; FALLBACK27-NEXT:    shrdl %cl, %edx, %edi
+; FALLBACK27-NEXT:    movl %edi, (%ebp)
+; FALLBACK27-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK27-NEXT:    movl %ecx, 4(%ebp)
+; FALLBACK27-NEXT:    movl %eax, 60(%ebp)
+; FALLBACK27-NEXT:    addl $188, %esp
+; FALLBACK27-NEXT:    popl %esi
+; FALLBACK27-NEXT:    popl %edi
+; FALLBACK27-NEXT:    popl %ebx
+; FALLBACK27-NEXT:    popl %ebp
+; FALLBACK27-NEXT:    vzeroupper
+; FALLBACK27-NEXT:    retl
+;
+; FALLBACK28-LABEL: lshr_64bytes:
+; FALLBACK28:       # %bb.0:
+; FALLBACK28-NEXT:    pushl %ebp
+; FALLBACK28-NEXT:    pushl %ebx
+; FALLBACK28-NEXT:    pushl %edi
+; FALLBACK28-NEXT:    pushl %esi
+; FALLBACK28-NEXT:    subl $204, %esp
+; FALLBACK28-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; FALLBACK28-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; FALLBACK28-NEXT:    vmovups (%ecx), %zmm0
+; FALLBACK28-NEXT:    movl (%eax), %ecx
+; FALLBACK28-NEXT:    vxorps %xmm1, %xmm1, %xmm1
+; FALLBACK28-NEXT:    vmovups %zmm1, {{[0-9]+}}(%esp)
+; FALLBACK28-NEXT:    vmovups %zmm0, {{[0-9]+}}(%esp)
+; FALLBACK28-NEXT:    movl %ecx, %esi
+; FALLBACK28-NEXT:    andl $60, %esi
+; FALLBACK28-NEXT:    movl 68(%esp,%esi), %edx
+; FALLBACK28-NEXT:    shll $3, %ecx
+; FALLBACK28-NEXT:    andl $24, %ecx
+; FALLBACK28-NEXT:    movl %edx, %edi
+; FALLBACK28-NEXT:    shrl %cl, %edi
+; FALLBACK28-NEXT:    movl 72(%esp,%esi), %eax
+; FALLBACK28-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK28-NEXT:    leal (%eax,%eax), %ebx
+; FALLBACK28-NEXT:    movl %ecx, %ebp
+; FALLBACK28-NEXT:    movb %cl, %ch
+; FALLBACK28-NEXT:    notb %ch
+; FALLBACK28-NEXT:    movb %ch, %cl
+; FALLBACK28-NEXT:    shll %cl, %ebx
+; FALLBACK28-NEXT:    orl %edi, %ebx
+; FALLBACK28-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK28-NEXT:    movl 64(%esp,%esi), %edi
+; FALLBACK28-NEXT:    movl %ebp, %eax
+; FALLBACK28-NEXT:    movb %al, %cl
+; FALLBACK28-NEXT:    shrl %cl, %edi
+; FALLBACK28-NEXT:    addl %edx, %edx
+; FALLBACK28-NEXT:    movb %ch, %cl
+; FALLBACK28-NEXT:    shll %cl, %edx
+; FALLBACK28-NEXT:    orl %edi, %edx
+; FALLBACK28-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK28-NEXT:    movl 76(%esp,%esi), %edx
+; FALLBACK28-NEXT:    movl %edx, %ebp
+; FALLBACK28-NEXT:    movb %al, %cl
+; FALLBACK28-NEXT:    shrl %cl, %ebp
+; FALLBACK28-NEXT:    movl 80(%esp,%esi), %edi
+; FALLBACK28-NEXT:    leal (%edi,%edi), %ebx
+; FALLBACK28-NEXT:    movb %ch, %cl
+; FALLBACK28-NEXT:    shll %cl, %ebx
+; FALLBACK28-NEXT:    orl %ebp, %ebx
+; FALLBACK28-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK28-NEXT:    movb %al, %cl
+; FALLBACK28-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; FALLBACK28-NEXT:    shrl %cl, %ebx
+; FALLBACK28-NEXT:    addl %edx, %edx
+; FALLBACK28-NEXT:    movb %ch, %cl
+; FALLBACK28-NEXT:    shll %cl, %edx
+; FALLBACK28-NEXT:    orl %ebx, %edx
+; FALLBACK28-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK28-NEXT:    movl 84(%esp,%esi), %ebx
+; FALLBACK28-NEXT:    movl %ebx, %ebp
+; FALLBACK28-NEXT:    movl %eax, %edx
+; FALLBACK28-NEXT:    movb %dl, %cl
+; FALLBACK28-NEXT:    shrl %cl, %ebp
+; FALLBACK28-NEXT:    movl 88(%esp,%esi), %eax
+; FALLBACK28-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK28-NEXT:    addl %eax, %eax
+; FALLBACK28-NEXT:    movb %ch, %cl
+; FALLBACK28-NEXT:    shll %cl, %eax
+; FALLBACK28-NEXT:    orl %ebp, %eax
+; FALLBACK28-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK28-NEXT:    movb %dl, %cl
+; FALLBACK28-NEXT:    shrl %cl, %edi
+; FALLBACK28-NEXT:    addl %ebx, %ebx
+; FALLBACK28-NEXT:    movb %ch, %cl
+; FALLBACK28-NEXT:    movb %ch, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill
+; FALLBACK28-NEXT:    shll %cl, %ebx
+; FALLBACK28-NEXT:    orl %edi, %ebx
+; FALLBACK28-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK28-NEXT:    movl 92(%esp,%esi), %ebx
+; FALLBACK28-NEXT:    movl %ebx, %ebp
+; FALLBACK28-NEXT:    movb %dl, %cl
+; FALLBACK28-NEXT:    shrl %cl, %ebp
+; FALLBACK28-NEXT:    movl 96(%esp,%esi), %edi
+; FALLBACK28-NEXT:    leal (%edi,%edi), %eax
+; FALLBACK28-NEXT:    movb %ch, %cl
+; FALLBACK28-NEXT:    shll %cl, %eax
+; FALLBACK28-NEXT:    orl %ebp, %eax
+; FALLBACK28-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK28-NEXT:    movb %dl, %cl
+; FALLBACK28-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK28-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK28-NEXT:    shrl %cl, %eax
+; FALLBACK28-NEXT:    addl %ebx, %ebx
+; FALLBACK28-NEXT:    movb %ch, %cl
+; FALLBACK28-NEXT:    shll %cl, %ebx
+; FALLBACK28-NEXT:    orl %eax, %ebx
+; FALLBACK28-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK28-NEXT:    movl 100(%esp,%esi), %ebx
+; FALLBACK28-NEXT:    movl %ebx, %ebp
+; FALLBACK28-NEXT:    movb %dl, %cl
+; FALLBACK28-NEXT:    shrl %cl, %ebp
+; FALLBACK28-NEXT:    movl 104(%esp,%esi), %edx
+; FALLBACK28-NEXT:    leal (%edx,%edx), %eax
+; FALLBACK28-NEXT:    movb %ch, %cl
+; FALLBACK28-NEXT:    shll %cl, %eax
+; FALLBACK28-NEXT:    orl %ebp, %eax
+; FALLBACK28-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK28-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK28-NEXT:    movb %al, %cl
+; FALLBACK28-NEXT:    shrl %cl, %edi
+; FALLBACK28-NEXT:    addl %ebx, %ebx
+; FALLBACK28-NEXT:    movb %ch, %cl
+; FALLBACK28-NEXT:    shll %cl, %ebx
+; FALLBACK28-NEXT:    orl %edi, %ebx
+; FALLBACK28-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK28-NEXT:    movl 108(%esp,%esi), %edi
+; FALLBACK28-NEXT:    movl %edi, %ebp
+; FALLBACK28-NEXT:    movl %eax, %ecx
+; FALLBACK28-NEXT:    shrl %cl, %ebp
+; FALLBACK28-NEXT:    movl 112(%esp,%esi), %ecx
+; FALLBACK28-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK28-NEXT:    leal (%ecx,%ecx), %ebx
+; FALLBACK28-NEXT:    movb {{[-0-9]+}}(%e{{[sb]}}p), %ch # 1-byte Reload
+; FALLBACK28-NEXT:    movb %ch, %cl
+; FALLBACK28-NEXT:    shll %cl, %ebx
+; FALLBACK28-NEXT:    orl %ebp, %ebx
+; FALLBACK28-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK28-NEXT:    movb %al, %cl
+; FALLBACK28-NEXT:    shrl %cl, %edx
+; FALLBACK28-NEXT:    addl %edi, %edi
+; FALLBACK28-NEXT:    movb %ch, %cl
+; FALLBACK28-NEXT:    shll %cl, %edi
+; FALLBACK28-NEXT:    orl %edx, %edi
+; FALLBACK28-NEXT:    movl %esi, %edx
+; FALLBACK28-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK28-NEXT:    movl 116(%esp,%esi), %esi
+; FALLBACK28-NEXT:    movl %esi, %ebx
+; FALLBACK28-NEXT:    movb %al, %cl
+; FALLBACK28-NEXT:    shrl %cl, %ebx
+; FALLBACK28-NEXT:    movl 120(%esp,%edx), %eax
+; FALLBACK28-NEXT:    leal (%eax,%eax), %ebp
+; FALLBACK28-NEXT:    movb %ch, %cl
+; FALLBACK28-NEXT:    shll %cl, %ebp
+; FALLBACK28-NEXT:    orl %ebx, %ebp
+; FALLBACK28-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; FALLBACK28-NEXT:    movb %dl, %cl
+; FALLBACK28-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; FALLBACK28-NEXT:    shrl %cl, %ebx
+; FALLBACK28-NEXT:    addl %esi, %esi
+; FALLBACK28-NEXT:    movb %ch, %cl
+; FALLBACK28-NEXT:    shll %cl, %esi
+; FALLBACK28-NEXT:    orl %ebx, %esi
+; FALLBACK28-NEXT:    movb %dl, %cl
+; FALLBACK28-NEXT:    shrl %cl, %eax
+; FALLBACK28-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; FALLBACK28-NEXT:    movl 124(%esp,%edx), %ebx
+; FALLBACK28-NEXT:    leal (%ebx,%ebx), %edx
+; FALLBACK28-NEXT:    movb %ch, %cl
+; FALLBACK28-NEXT:    shll %cl, %edx
+; FALLBACK28-NEXT:    orl %eax, %edx
+; FALLBACK28-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK28-NEXT:    # kill: def $cl killed $cl killed $ecx
+; FALLBACK28-NEXT:    shrl %cl, %ebx
+; FALLBACK28-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; FALLBACK28-NEXT:    movl %ebx, 60(%eax)
+; FALLBACK28-NEXT:    movl %edx, 56(%eax)
+; FALLBACK28-NEXT:    movl %esi, 48(%eax)
+; FALLBACK28-NEXT:    movl %ebp, 52(%eax)
+; FALLBACK28-NEXT:    movl %edi, 40(%eax)
+; FALLBACK28-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK28-NEXT:    movl %ecx, 44(%eax)
+; FALLBACK28-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK28-NEXT:    movl %ecx, 32(%eax)
+; FALLBACK28-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK28-NEXT:    movl %ecx, 36(%eax)
+; FALLBACK28-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK28-NEXT:    movl %ecx, 24(%eax)
+; FALLBACK28-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK28-NEXT:    movl %ecx, 28(%eax)
+; FALLBACK28-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK28-NEXT:    movl %ecx, 16(%eax)
+; FALLBACK28-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK28-NEXT:    movl %ecx, 20(%eax)
+; FALLBACK28-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK28-NEXT:    movl %ecx, 8(%eax)
+; FALLBACK28-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK28-NEXT:    movl %ecx, 12(%eax)
+; FALLBACK28-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK28-NEXT:    movl %ecx, (%eax)
+; FALLBACK28-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK28-NEXT:    movl %ecx, 4(%eax)
+; FALLBACK28-NEXT:    addl $204, %esp
+; FALLBACK28-NEXT:    popl %esi
+; FALLBACK28-NEXT:    popl %edi
+; FALLBACK28-NEXT:    popl %ebx
+; FALLBACK28-NEXT:    popl %ebp
+; FALLBACK28-NEXT:    vzeroupper
+; FALLBACK28-NEXT:    retl
+;
+; FALLBACK29-LABEL: lshr_64bytes:
+; FALLBACK29:       # %bb.0:
+; FALLBACK29-NEXT:    pushl %ebp
+; FALLBACK29-NEXT:    pushl %ebx
+; FALLBACK29-NEXT:    pushl %edi
+; FALLBACK29-NEXT:    pushl %esi
+; FALLBACK29-NEXT:    subl $188, %esp
+; FALLBACK29-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; FALLBACK29-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; FALLBACK29-NEXT:    vmovups (%ecx), %zmm0
+; FALLBACK29-NEXT:    movl (%eax), %ecx
+; FALLBACK29-NEXT:    vxorps %xmm1, %xmm1, %xmm1
+; FALLBACK29-NEXT:    vmovups %zmm1, {{[0-9]+}}(%esp)
+; FALLBACK29-NEXT:    vmovups %zmm0, {{[0-9]+}}(%esp)
+; FALLBACK29-NEXT:    movl %ecx, %ebp
+; FALLBACK29-NEXT:    andl $60, %ebp
+; FALLBACK29-NEXT:    movl 56(%esp,%ebp), %edx
+; FALLBACK29-NEXT:    movl 52(%esp,%ebp), %eax
+; FALLBACK29-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK29-NEXT:    shll $3, %ecx
+; FALLBACK29-NEXT:    andl $24, %ecx
+; FALLBACK29-NEXT:    shrdl %cl, %edx, %eax
+; FALLBACK29-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK29-NEXT:    movl 64(%esp,%ebp), %edi
+; FALLBACK29-NEXT:    movl 60(%esp,%ebp), %eax
+; FALLBACK29-NEXT:    movl %eax, %esi
+; FALLBACK29-NEXT:    shrdl %cl, %edi, %esi
+; FALLBACK29-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK29-NEXT:    shrdl %cl, %eax, %edx
+; FALLBACK29-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK29-NEXT:    movl 72(%esp,%ebp), %esi
+; FALLBACK29-NEXT:    movl 68(%esp,%ebp), %eax
+; FALLBACK29-NEXT:    movl %eax, %edx
+; FALLBACK29-NEXT:    shrdl %cl, %esi, %edx
+; FALLBACK29-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK29-NEXT:    shrdl %cl, %eax, %edi
+; FALLBACK29-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK29-NEXT:    movl 80(%esp,%ebp), %edi
+; FALLBACK29-NEXT:    movl 76(%esp,%ebp), %eax
+; FALLBACK29-NEXT:    movl %eax, %edx
+; FALLBACK29-NEXT:    shrdl %cl, %edi, %edx
+; FALLBACK29-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK29-NEXT:    shrdl %cl, %eax, %esi
+; FALLBACK29-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK29-NEXT:    movl 88(%esp,%ebp), %esi
+; FALLBACK29-NEXT:    movl 84(%esp,%ebp), %eax
+; FALLBACK29-NEXT:    movl %eax, %edx
+; FALLBACK29-NEXT:    shrdl %cl, %esi, %edx
+; FALLBACK29-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK29-NEXT:    movl %esi, %edx
+; FALLBACK29-NEXT:    shrdl %cl, %eax, %edi
+; FALLBACK29-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK29-NEXT:    movl 96(%esp,%ebp), %esi
+; FALLBACK29-NEXT:    movl 92(%esp,%ebp), %eax
+; FALLBACK29-NEXT:    movl %eax, %edi
+; FALLBACK29-NEXT:    shrdl %cl, %esi, %edi
+; FALLBACK29-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK29-NEXT:    shrdl %cl, %eax, %edx
+; FALLBACK29-NEXT:    movl %edx, (%esp) # 4-byte Spill
+; FALLBACK29-NEXT:    movl 104(%esp,%ebp), %edx
+; FALLBACK29-NEXT:    movl 100(%esp,%ebp), %eax
+; FALLBACK29-NEXT:    movl %eax, %edi
+; FALLBACK29-NEXT:    shrdl %cl, %edx, %edi
+; FALLBACK29-NEXT:    shrdl %cl, %eax, %esi
+; FALLBACK29-NEXT:    movl 48(%esp,%ebp), %ebx
+; FALLBACK29-NEXT:    movl 108(%esp,%ebp), %eax
+; FALLBACK29-NEXT:    shrdl %cl, %eax, %edx
+; FALLBACK29-NEXT:    movl {{[0-9]+}}(%esp), %ebp
+; FALLBACK29-NEXT:    movl %edx, 56(%ebp)
+; FALLBACK29-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; FALLBACK29-NEXT:    shrdl %cl, %edx, %ebx
+; FALLBACK29-NEXT:    # kill: def $cl killed $cl killed $ecx
+; FALLBACK29-NEXT:    shrl %cl, %eax
+; FALLBACK29-NEXT:    movl %eax, 60(%ebp)
+; FALLBACK29-NEXT:    movl %esi, 48(%ebp)
+; FALLBACK29-NEXT:    movl %edi, 52(%ebp)
+; FALLBACK29-NEXT:    movl (%esp), %eax # 4-byte Reload
+; FALLBACK29-NEXT:    movl %eax, 40(%ebp)
+; FALLBACK29-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK29-NEXT:    movl %eax, 44(%ebp)
+; FALLBACK29-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK29-NEXT:    movl %eax, 32(%ebp)
+; FALLBACK29-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK29-NEXT:    movl %eax, 36(%ebp)
+; FALLBACK29-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK29-NEXT:    movl %eax, 24(%ebp)
+; FALLBACK29-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK29-NEXT:    movl %eax, 28(%ebp)
+; FALLBACK29-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK29-NEXT:    movl %eax, 16(%ebp)
+; FALLBACK29-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK29-NEXT:    movl %eax, 20(%ebp)
+; FALLBACK29-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK29-NEXT:    movl %eax, 8(%ebp)
+; FALLBACK29-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK29-NEXT:    movl %eax, 12(%ebp)
+; FALLBACK29-NEXT:    movl %ebx, (%ebp)
+; FALLBACK29-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK29-NEXT:    movl %eax, 4(%ebp)
+; FALLBACK29-NEXT:    addl $188, %esp
+; FALLBACK29-NEXT:    popl %esi
+; FALLBACK29-NEXT:    popl %edi
+; FALLBACK29-NEXT:    popl %ebx
+; FALLBACK29-NEXT:    popl %ebp
+; FALLBACK29-NEXT:    vzeroupper
+; FALLBACK29-NEXT:    retl
+;
+; FALLBACK30-LABEL: lshr_64bytes:
+; FALLBACK30:       # %bb.0:
+; FALLBACK30-NEXT:    pushl %ebp
+; FALLBACK30-NEXT:    pushl %ebx
+; FALLBACK30-NEXT:    pushl %edi
+; FALLBACK30-NEXT:    pushl %esi
+; FALLBACK30-NEXT:    subl $204, %esp
+; FALLBACK30-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; FALLBACK30-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; FALLBACK30-NEXT:    vmovups (%ecx), %zmm0
+; FALLBACK30-NEXT:    movl (%eax), %edx
+; FALLBACK30-NEXT:    vxorps %xmm1, %xmm1, %xmm1
+; FALLBACK30-NEXT:    vmovups %zmm1, {{[0-9]+}}(%esp)
+; FALLBACK30-NEXT:    vmovups %zmm0, {{[0-9]+}}(%esp)
+; FALLBACK30-NEXT:    leal (,%edx,8), %ecx
+; FALLBACK30-NEXT:    andl $24, %ecx
+; FALLBACK30-NEXT:    andl $60, %edx
+; FALLBACK30-NEXT:    movl 68(%esp,%edx), %esi
+; FALLBACK30-NEXT:    movl 72(%esp,%edx), %eax
+; FALLBACK30-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK30-NEXT:    shrxl %ecx, %esi, %edi
+; FALLBACK30-NEXT:    movl %ecx, %ebx
+; FALLBACK30-NEXT:    notb %bl
+; FALLBACK30-NEXT:    leal (%eax,%eax), %ebp
+; FALLBACK30-NEXT:    shlxl %ebx, %ebp, %ebp
+; FALLBACK30-NEXT:    orl %edi, %ebp
+; FALLBACK30-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK30-NEXT:    shrxl %ecx, 64(%esp,%edx), %edi
+; FALLBACK30-NEXT:    addl %esi, %esi
+; FALLBACK30-NEXT:    shlxl %ebx, %esi, %esi
+; FALLBACK30-NEXT:    orl %edi, %esi
+; FALLBACK30-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK30-NEXT:    movl 80(%esp,%edx), %esi
+; FALLBACK30-NEXT:    leal (%esi,%esi), %edi
+; FALLBACK30-NEXT:    shlxl %ebx, %edi, %eax
+; FALLBACK30-NEXT:    movl 76(%esp,%edx), %edi
+; FALLBACK30-NEXT:    shrxl %ecx, %edi, %ebp
+; FALLBACK30-NEXT:    orl %ebp, %eax
+; FALLBACK30-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK30-NEXT:    shrxl %ecx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; FALLBACK30-NEXT:    addl %edi, %edi
+; FALLBACK30-NEXT:    shlxl %ebx, %edi, %edi
+; FALLBACK30-NEXT:    orl %eax, %edi
+; FALLBACK30-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK30-NEXT:    movl 88(%esp,%edx), %eax
+; FALLBACK30-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK30-NEXT:    leal (%eax,%eax), %edi
+; FALLBACK30-NEXT:    shlxl %ebx, %edi, %eax
+; FALLBACK30-NEXT:    movl 84(%esp,%edx), %edi
+; FALLBACK30-NEXT:    shrxl %ecx, %edi, %ebp
+; FALLBACK30-NEXT:    orl %ebp, %eax
+; FALLBACK30-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK30-NEXT:    shrxl %ecx, %esi, %esi
+; FALLBACK30-NEXT:    addl %edi, %edi
+; FALLBACK30-NEXT:    shlxl %ebx, %edi, %eax
+; FALLBACK30-NEXT:    orl %esi, %eax
+; FALLBACK30-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK30-NEXT:    movl 96(%esp,%edx), %esi
+; FALLBACK30-NEXT:    leal (%esi,%esi), %edi
+; FALLBACK30-NEXT:    shlxl %ebx, %edi, %eax
+; FALLBACK30-NEXT:    movl 92(%esp,%edx), %edi
+; FALLBACK30-NEXT:    shrxl %ecx, %edi, %ebp
+; FALLBACK30-NEXT:    orl %ebp, %eax
+; FALLBACK30-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK30-NEXT:    shrxl %ecx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; FALLBACK30-NEXT:    addl %edi, %edi
+; FALLBACK30-NEXT:    shlxl %ebx, %edi, %edi
+; FALLBACK30-NEXT:    orl %eax, %edi
+; FALLBACK30-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK30-NEXT:    movl 104(%esp,%edx), %eax
+; FALLBACK30-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK30-NEXT:    leal (%eax,%eax), %edi
+; FALLBACK30-NEXT:    shlxl %ebx, %edi, %eax
+; FALLBACK30-NEXT:    movl 100(%esp,%edx), %edi
+; FALLBACK30-NEXT:    shrxl %ecx, %edi, %ebp
+; FALLBACK30-NEXT:    orl %ebp, %eax
+; FALLBACK30-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK30-NEXT:    shrxl %ecx, %esi, %esi
+; FALLBACK30-NEXT:    addl %edi, %edi
+; FALLBACK30-NEXT:    shlxl %ebx, %edi, %eax
+; FALLBACK30-NEXT:    orl %esi, %eax
+; FALLBACK30-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK30-NEXT:    movl 112(%esp,%edx), %eax
+; FALLBACK30-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK30-NEXT:    leal (%eax,%eax), %esi
+; FALLBACK30-NEXT:    shlxl %ebx, %esi, %eax
+; FALLBACK30-NEXT:    movl 108(%esp,%edx), %esi
+; FALLBACK30-NEXT:    shrxl %ecx, %esi, %ebp
+; FALLBACK30-NEXT:    orl %ebp, %eax
+; FALLBACK30-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK30-NEXT:    shrxl %ecx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; FALLBACK30-NEXT:    addl %esi, %esi
+; FALLBACK30-NEXT:    shlxl %ebx, %esi, %esi
+; FALLBACK30-NEXT:    orl %eax, %esi
+; FALLBACK30-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK30-NEXT:    movl 120(%esp,%edx), %ebp
+; FALLBACK30-NEXT:    leal (%ebp,%ebp), %eax
+; FALLBACK30-NEXT:    shlxl %ebx, %eax, %esi
+; FALLBACK30-NEXT:    movl 116(%esp,%edx), %eax
+; FALLBACK30-NEXT:    shrxl %ecx, %eax, %edi
+; FALLBACK30-NEXT:    orl %edi, %esi
+; FALLBACK30-NEXT:    shrxl %ecx, {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
+; FALLBACK30-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK30-NEXT:    addl %eax, %eax
+; FALLBACK30-NEXT:    shlxl %ebx, %eax, %edi
+; FALLBACK30-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
+; FALLBACK30-NEXT:    shrxl %ecx, %ebp, %eax
+; FALLBACK30-NEXT:    movl 124(%esp,%edx), %edx
+; FALLBACK30-NEXT:    shrxl %ecx, %edx, %ebp
+; FALLBACK30-NEXT:    leal (%edx,%edx), %ecx
+; FALLBACK30-NEXT:    shlxl %ebx, %ecx, %edx
+; FALLBACK30-NEXT:    orl %eax, %edx
+; FALLBACK30-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; FALLBACK30-NEXT:    movl %ebp, 60(%ecx)
+; FALLBACK30-NEXT:    movl %edx, 56(%ecx)
+; FALLBACK30-NEXT:    movl %edi, 48(%ecx)
+; FALLBACK30-NEXT:    movl %esi, 52(%ecx)
+; FALLBACK30-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK30-NEXT:    movl %eax, 40(%ecx)
+; FALLBACK30-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK30-NEXT:    movl %eax, 44(%ecx)
+; FALLBACK30-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK30-NEXT:    movl %eax, 32(%ecx)
+; FALLBACK30-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK30-NEXT:    movl %eax, 36(%ecx)
+; FALLBACK30-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK30-NEXT:    movl %eax, 24(%ecx)
+; FALLBACK30-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK30-NEXT:    movl %eax, 28(%ecx)
+; FALLBACK30-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK30-NEXT:    movl %eax, 16(%ecx)
+; FALLBACK30-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK30-NEXT:    movl %eax, 20(%ecx)
+; FALLBACK30-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK30-NEXT:    movl %eax, 8(%ecx)
+; FALLBACK30-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK30-NEXT:    movl %eax, 12(%ecx)
+; FALLBACK30-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK30-NEXT:    movl %eax, (%ecx)
+; FALLBACK30-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK30-NEXT:    movl %eax, 4(%ecx)
+; FALLBACK30-NEXT:    addl $204, %esp
+; FALLBACK30-NEXT:    popl %esi
+; FALLBACK30-NEXT:    popl %edi
+; FALLBACK30-NEXT:    popl %ebx
+; FALLBACK30-NEXT:    popl %ebp
+; FALLBACK30-NEXT:    vzeroupper
+; FALLBACK30-NEXT:    retl
+;
+; FALLBACK31-LABEL: lshr_64bytes:
+; FALLBACK31:       # %bb.0:
+; FALLBACK31-NEXT:    pushl %ebp
+; FALLBACK31-NEXT:    pushl %ebx
+; FALLBACK31-NEXT:    pushl %edi
+; FALLBACK31-NEXT:    pushl %esi
+; FALLBACK31-NEXT:    subl $188, %esp
+; FALLBACK31-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; FALLBACK31-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; FALLBACK31-NEXT:    vmovups (%ecx), %zmm0
+; FALLBACK31-NEXT:    movl (%eax), %ecx
+; FALLBACK31-NEXT:    vxorps %xmm1, %xmm1, %xmm1
+; FALLBACK31-NEXT:    vmovups %zmm1, {{[0-9]+}}(%esp)
+; FALLBACK31-NEXT:    vmovups %zmm0, {{[0-9]+}}(%esp)
+; FALLBACK31-NEXT:    movl %ecx, %ebp
+; FALLBACK31-NEXT:    andl $60, %ebp
+; FALLBACK31-NEXT:    movl 56(%esp,%ebp), %edx
+; FALLBACK31-NEXT:    movl 52(%esp,%ebp), %eax
+; FALLBACK31-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK31-NEXT:    shll $3, %ecx
+; FALLBACK31-NEXT:    andl $24, %ecx
+; FALLBACK31-NEXT:    shrdl %cl, %edx, %eax
+; FALLBACK31-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK31-NEXT:    movl 64(%esp,%ebp), %edi
+; FALLBACK31-NEXT:    movl 60(%esp,%ebp), %eax
+; FALLBACK31-NEXT:    movl %eax, %esi
+; FALLBACK31-NEXT:    shrdl %cl, %edi, %esi
+; FALLBACK31-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK31-NEXT:    shrdl %cl, %eax, %edx
+; FALLBACK31-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK31-NEXT:    movl 72(%esp,%ebp), %esi
+; FALLBACK31-NEXT:    movl 68(%esp,%ebp), %eax
+; FALLBACK31-NEXT:    movl %eax, %edx
+; FALLBACK31-NEXT:    shrdl %cl, %esi, %edx
+; FALLBACK31-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK31-NEXT:    shrdl %cl, %eax, %edi
+; FALLBACK31-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK31-NEXT:    movl 80(%esp,%ebp), %edi
+; FALLBACK31-NEXT:    movl 76(%esp,%ebp), %eax
+; FALLBACK31-NEXT:    movl %eax, %edx
+; FALLBACK31-NEXT:    shrdl %cl, %edi, %edx
+; FALLBACK31-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK31-NEXT:    shrdl %cl, %eax, %esi
+; FALLBACK31-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK31-NEXT:    movl 88(%esp,%ebp), %ebx
+; FALLBACK31-NEXT:    movl 84(%esp,%ebp), %eax
+; FALLBACK31-NEXT:    movl %eax, %edx
+; FALLBACK31-NEXT:    shrdl %cl, %ebx, %edx
+; FALLBACK31-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK31-NEXT:    shrdl %cl, %eax, %edi
+; FALLBACK31-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK31-NEXT:    movl 96(%esp,%ebp), %esi
+; FALLBACK31-NEXT:    movl 92(%esp,%ebp), %eax
+; FALLBACK31-NEXT:    movl %eax, %edx
+; FALLBACK31-NEXT:    shrdl %cl, %esi, %edx
+; FALLBACK31-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK31-NEXT:    shrdl %cl, %eax, %ebx
+; FALLBACK31-NEXT:    movl 104(%esp,%ebp), %eax
+; FALLBACK31-NEXT:    movl 100(%esp,%ebp), %edi
+; FALLBACK31-NEXT:    movl %edi, %edx
+; FALLBACK31-NEXT:    shrdl %cl, %eax, %edx
+; FALLBACK31-NEXT:    shrdl %cl, %edi, %esi
+; FALLBACK31-NEXT:    movl 48(%esp,%ebp), %edi
+; FALLBACK31-NEXT:    movl 108(%esp,%ebp), %ebp
+; FALLBACK31-NEXT:    movl %ebp, (%esp) # 4-byte Spill
+; FALLBACK31-NEXT:    shrdl %cl, %ebp, %eax
+; FALLBACK31-NEXT:    movl {{[0-9]+}}(%esp), %ebp
+; FALLBACK31-NEXT:    movl %eax, 56(%ebp)
+; FALLBACK31-NEXT:    movl %esi, 48(%ebp)
+; FALLBACK31-NEXT:    movl %edx, 52(%ebp)
+; FALLBACK31-NEXT:    movl %ebx, 40(%ebp)
+; FALLBACK31-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK31-NEXT:    movl %eax, 44(%ebp)
+; FALLBACK31-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK31-NEXT:    movl %eax, 32(%ebp)
+; FALLBACK31-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK31-NEXT:    movl %eax, 36(%ebp)
+; FALLBACK31-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK31-NEXT:    movl %eax, 24(%ebp)
+; FALLBACK31-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK31-NEXT:    movl %eax, 28(%ebp)
+; FALLBACK31-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK31-NEXT:    movl %eax, 16(%ebp)
+; FALLBACK31-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK31-NEXT:    movl %eax, 20(%ebp)
+; FALLBACK31-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK31-NEXT:    movl %eax, 8(%ebp)
+; FALLBACK31-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK31-NEXT:    movl %eax, 12(%ebp)
+; FALLBACK31-NEXT:    shrxl %ecx, (%esp), %eax # 4-byte Folded Reload
+; FALLBACK31-NEXT:    # kill: def $cl killed $cl killed $ecx
+; FALLBACK31-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; FALLBACK31-NEXT:    shrdl %cl, %edx, %edi
+; FALLBACK31-NEXT:    movl %edi, (%ebp)
+; FALLBACK31-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK31-NEXT:    movl %ecx, 4(%ebp)
+; FALLBACK31-NEXT:    movl %eax, 60(%ebp)
+; FALLBACK31-NEXT:    addl $188, %esp
+; FALLBACK31-NEXT:    popl %esi
+; FALLBACK31-NEXT:    popl %edi
+; FALLBACK31-NEXT:    popl %ebx
+; FALLBACK31-NEXT:    popl %ebp
+; FALLBACK31-NEXT:    vzeroupper
+; FALLBACK31-NEXT:    retl
+  %src = load i512, ptr %src.ptr, align 1
+  %byteOff = load i512, ptr %byteOff.ptr, align 1
+  %bitOff = shl i512 %byteOff, 3
+  %res = lshr i512 %src, %bitOff
+  store i512 %res, ptr %dst, align 1
+  ret void
+}
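
As a rough C model of what the byte-offset check lines above encode (a sketch inferred from the visible asm, not code from this patch; the helper name is illustrative and little-endian word order is assumed): the offset is split into a qword-granular index into a zero-padded stack buffer plus a residual bit count applied with one funnel shift per output word — the `andl $56, %esi` and `leal (,%rsi,8), %eax; andl $56, %eax` pair in the x86-64 output.

  #include <stdint.h>
  #include <string.h>

  /* Hypothetical reference for lshr_64bytes: shift a 64-byte value right
     by a byte offset via a zero-padded stack buffer. */
  void lshr_64bytes_ref(const uint64_t src[8], uint64_t byteOff,
                        uint64_t dst[8]) {
    uint64_t buf[17] = {0};             /* upper words stay zero, like the
                                           movaps %xmm4 stores above */
    memcpy(buf, src, 64);               /* spill the source to the slot */
    uint64_t word = (byteOff & 63) / 8; /* qword part, cf. andl $56, %esi */
    unsigned bits = (byteOff * 8) & 56; /* residual bits, cf. leal + andl */
    for (int i = 0; i < 8; i++) {
      uint64_t lo = buf[word + i], hi = buf[word + i + 1];
      dst[i] = bits ? (lo >> bits) | (hi << (64 - bits)) : lo;
    }
  }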
+
+define void @lshr_64bytes_qwordOff(ptr %src.ptr, ptr %qwordOff.ptr, ptr %dst) nounwind {
+; X64-SSE2-LABEL: lshr_64bytes_qwordOff:
 ; X64-SSE2:       # %bb.0:
 ; X64-SSE2-NEXT:    pushq %rbx
 ; X64-SSE2-NEXT:    movq (%rdi), %rax
@@ -1667,6 +15700,11 @@ define void @lshr_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ; X64-SSE2-NEXT:    movq 48(%rdi), %rbx
 ; X64-SSE2-NEXT:    movq 56(%rdi), %rdi
 ; X64-SSE2-NEXT:    movl (%rsi), %esi
+; X64-SSE2-NEXT:    xorps %xmm0, %xmm0
+; X64-SSE2-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
+; X64-SSE2-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
+; X64-SSE2-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
+; X64-SSE2-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
 ; X64-SSE2-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
 ; X64-SSE2-NEXT:    movq %rbx, -{{[0-9]+}}(%rsp)
 ; X64-SSE2-NEXT:    movq %r11, -{{[0-9]+}}(%rsp)
@@ -1675,23 +15713,15 @@ define void @lshr_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ; X64-SSE2-NEXT:    movq %r8, -{{[0-9]+}}(%rsp)
 ; X64-SSE2-NEXT:    movq %rcx, -{{[0-9]+}}(%rsp)
 ; X64-SSE2-NEXT:    movq %rax, -{{[0-9]+}}(%rsp)
-; X64-SSE2-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
-; X64-SSE2-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
-; X64-SSE2-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
-; X64-SSE2-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
-; X64-SSE2-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
-; X64-SSE2-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
-; X64-SSE2-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
-; X64-SSE2-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
-; X64-SSE2-NEXT:    andl $63, %esi
-; X64-SSE2-NEXT:    movq -128(%rsp,%rsi), %rax
-; X64-SSE2-NEXT:    movq -120(%rsp,%rsi), %rcx
-; X64-SSE2-NEXT:    movq -104(%rsp,%rsi), %rdi
-; X64-SSE2-NEXT:    movq -112(%rsp,%rsi), %r8
-; X64-SSE2-NEXT:    movq -88(%rsp,%rsi), %r9
-; X64-SSE2-NEXT:    movq -96(%rsp,%rsi), %r10
-; X64-SSE2-NEXT:    movq -72(%rsp,%rsi), %r11
-; X64-SSE2-NEXT:    movq -80(%rsp,%rsi), %rsi
+; X64-SSE2-NEXT:    andl $7, %esi
+; X64-SSE2-NEXT:    movq -128(%rsp,%rsi,8), %rax
+; X64-SSE2-NEXT:    movq -120(%rsp,%rsi,8), %rcx
+; X64-SSE2-NEXT:    movq -104(%rsp,%rsi,8), %rdi
+; X64-SSE2-NEXT:    movq -112(%rsp,%rsi,8), %r8
+; X64-SSE2-NEXT:    movq -88(%rsp,%rsi,8), %r9
+; X64-SSE2-NEXT:    movq -96(%rsp,%rsi,8), %r10
+; X64-SSE2-NEXT:    movq -72(%rsp,%rsi,8), %r11
+; X64-SSE2-NEXT:    movq -80(%rsp,%rsi,8), %rsi
 ; X64-SSE2-NEXT:    movq %rsi, 48(%rdx)
 ; X64-SSE2-NEXT:    movq %r11, 56(%rdx)
 ; X64-SSE2-NEXT:    movq %r10, 32(%rdx)
@@ -1703,35 +15733,38 @@ define void @lshr_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ; X64-SSE2-NEXT:    popq %rbx
 ; X64-SSE2-NEXT:    retq
 ;
-; X64-SSE42-LABEL: lshr_64bytes:
+; X64-SSE42-LABEL: lshr_64bytes_qwordOff:
 ; X64-SSE42:       # %bb.0:
+; X64-SSE42-NEXT:    pushq %rax
 ; X64-SSE42-NEXT:    movups (%rdi), %xmm0
 ; X64-SSE42-NEXT:    movups 16(%rdi), %xmm1
 ; X64-SSE42-NEXT:    movups 32(%rdi), %xmm2
 ; X64-SSE42-NEXT:    movups 48(%rdi), %xmm3
 ; X64-SSE42-NEXT:    movl (%rsi), %eax
 ; X64-SSE42-NEXT:    xorps %xmm4, %xmm4
-; X64-SSE42-NEXT:    movups %xmm4, -{{[0-9]+}}(%rsp)
-; X64-SSE42-NEXT:    movups %xmm4, -{{[0-9]+}}(%rsp)
-; X64-SSE42-NEXT:    movups %xmm4, -{{[0-9]+}}(%rsp)
-; X64-SSE42-NEXT:    movups %xmm4, -{{[0-9]+}}(%rsp)
-; X64-SSE42-NEXT:    movups %xmm3, -{{[0-9]+}}(%rsp)
-; X64-SSE42-NEXT:    movups %xmm2, -{{[0-9]+}}(%rsp)
-; X64-SSE42-NEXT:    movups %xmm1, -{{[0-9]+}}(%rsp)
-; X64-SSE42-NEXT:    movups %xmm0, -{{[0-9]+}}(%rsp)
-; X64-SSE42-NEXT:    andl $63, %eax
-; X64-SSE42-NEXT:    movups -128(%rsp,%rax), %xmm0
-; X64-SSE42-NEXT:    movups -112(%rsp,%rax), %xmm1
-; X64-SSE42-NEXT:    movups -96(%rsp,%rax), %xmm2
-; X64-SSE42-NEXT:    movups -80(%rsp,%rax), %xmm3
+; X64-SSE42-NEXT:    movaps %xmm4, -{{[0-9]+}}(%rsp)
+; X64-SSE42-NEXT:    movaps %xmm4, -{{[0-9]+}}(%rsp)
+; X64-SSE42-NEXT:    movaps %xmm4, -{{[0-9]+}}(%rsp)
+; X64-SSE42-NEXT:    movaps %xmm4, -{{[0-9]+}}(%rsp)
+; X64-SSE42-NEXT:    movaps %xmm3, -{{[0-9]+}}(%rsp)
+; X64-SSE42-NEXT:    movaps %xmm2, -{{[0-9]+}}(%rsp)
+; X64-SSE42-NEXT:    movaps %xmm1, -{{[0-9]+}}(%rsp)
+; X64-SSE42-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
+; X64-SSE42-NEXT:    andl $7, %eax
+; X64-SSE42-NEXT:    movups -128(%rsp,%rax,8), %xmm0
+; X64-SSE42-NEXT:    movups -112(%rsp,%rax,8), %xmm1
+; X64-SSE42-NEXT:    movups -96(%rsp,%rax,8), %xmm2
+; X64-SSE42-NEXT:    movups -80(%rsp,%rax,8), %xmm3
 ; X64-SSE42-NEXT:    movups %xmm3, 48(%rdx)
 ; X64-SSE42-NEXT:    movups %xmm1, 16(%rdx)
 ; X64-SSE42-NEXT:    movups %xmm2, 32(%rdx)
 ; X64-SSE42-NEXT:    movups %xmm0, (%rdx)
+; X64-SSE42-NEXT:    popq %rax
 ; X64-SSE42-NEXT:    retq
 ;
-; X64-AVX1-LABEL: lshr_64bytes:
+; X64-AVX1-LABEL: lshr_64bytes_qwordOff:
 ; X64-AVX1:       # %bb.0:
+; X64-AVX1-NEXT:    pushq %rax
 ; X64-AVX1-NEXT:    vmovups (%rdi), %ymm0
 ; X64-AVX1-NEXT:    vmovups 32(%rdi), %ymm1
 ; X64-AVX1-NEXT:    movl (%rsi), %eax
@@ -1740,44 +15773,47 @@ define void @lshr_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ; X64-AVX1-NEXT:    vmovups %ymm2, -{{[0-9]+}}(%rsp)
 ; X64-AVX1-NEXT:    vmovups %ymm1, -{{[0-9]+}}(%rsp)
 ; X64-AVX1-NEXT:    vmovups %ymm0, -{{[0-9]+}}(%rsp)
-; X64-AVX1-NEXT:    andl $63, %eax
-; X64-AVX1-NEXT:    vmovups -128(%rsp,%rax), %xmm0
-; X64-AVX1-NEXT:    vmovups -112(%rsp,%rax), %xmm1
-; X64-AVX1-NEXT:    vmovups -96(%rsp,%rax), %xmm2
-; X64-AVX1-NEXT:    vmovups -80(%rsp,%rax), %xmm3
+; X64-AVX1-NEXT:    andl $7, %eax
+; X64-AVX1-NEXT:    vmovups -128(%rsp,%rax,8), %xmm0
+; X64-AVX1-NEXT:    vmovups -112(%rsp,%rax,8), %xmm1
+; X64-AVX1-NEXT:    vmovups -96(%rsp,%rax,8), %xmm2
+; X64-AVX1-NEXT:    vmovups -80(%rsp,%rax,8), %xmm3
 ; X64-AVX1-NEXT:    vmovups %xmm3, 48(%rdx)
 ; X64-AVX1-NEXT:    vmovups %xmm1, 16(%rdx)
 ; X64-AVX1-NEXT:    vmovups %xmm2, 32(%rdx)
 ; X64-AVX1-NEXT:    vmovups %xmm0, (%rdx)
+; X64-AVX1-NEXT:    popq %rax
 ; X64-AVX1-NEXT:    vzeroupper
 ; X64-AVX1-NEXT:    retq
 ;
-; X64-AVX512-LABEL: lshr_64bytes:
+; X64-AVX512-LABEL: lshr_64bytes_qwordOff:
 ; X64-AVX512:       # %bb.0:
+; X64-AVX512-NEXT:    pushq %rax
 ; X64-AVX512-NEXT:    vmovups (%rdi), %zmm0
 ; X64-AVX512-NEXT:    movl (%rsi), %eax
 ; X64-AVX512-NEXT:    vxorps %xmm1, %xmm1, %xmm1
 ; X64-AVX512-NEXT:    vmovups %zmm1, -{{[0-9]+}}(%rsp)
 ; X64-AVX512-NEXT:    vmovups %zmm0, -{{[0-9]+}}(%rsp)
-; X64-AVX512-NEXT:    andl $63, %eax
-; X64-AVX512-NEXT:    vmovups -128(%rsp,%rax), %xmm0
-; X64-AVX512-NEXT:    vmovups -112(%rsp,%rax), %xmm1
-; X64-AVX512-NEXT:    vmovups -96(%rsp,%rax), %xmm2
-; X64-AVX512-NEXT:    vmovups -80(%rsp,%rax), %xmm3
+; X64-AVX512-NEXT:    andl $7, %eax
+; X64-AVX512-NEXT:    vmovups -128(%rsp,%rax,8), %xmm0
+; X64-AVX512-NEXT:    vmovups -112(%rsp,%rax,8), %xmm1
+; X64-AVX512-NEXT:    vmovups -96(%rsp,%rax,8), %xmm2
+; X64-AVX512-NEXT:    vmovups -80(%rsp,%rax,8), %xmm3
 ; X64-AVX512-NEXT:    vmovups %xmm3, 48(%rdx)
 ; X64-AVX512-NEXT:    vmovups %xmm1, 16(%rdx)
 ; X64-AVX512-NEXT:    vmovups %xmm2, 32(%rdx)
 ; X64-AVX512-NEXT:    vmovups %xmm0, (%rdx)
+; X64-AVX512-NEXT:    popq %rax
 ; X64-AVX512-NEXT:    vzeroupper
 ; X64-AVX512-NEXT:    retq
 ;
-; X86-SSE2-LABEL: lshr_64bytes:
+; X86-SSE2-LABEL: lshr_64bytes_qwordOff:
 ; X86-SSE2:       # %bb.0:
 ; X86-SSE2-NEXT:    pushl %ebp
 ; X86-SSE2-NEXT:    pushl %ebx
 ; X86-SSE2-NEXT:    pushl %edi
 ; X86-SSE2-NEXT:    pushl %esi
-; X86-SSE2-NEXT:    subl $168, %esp
+; X86-SSE2-NEXT:    subl $188, %esp
 ; X86-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-SSE2-NEXT:    movl (%eax), %ecx
 ; X86-SSE2-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
@@ -1798,7 +15834,7 @@ define void @lshr_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ; X86-SSE2-NEXT:    movl 32(%eax), %ecx
 ; X86-SSE2-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-SSE2-NEXT:    movl 36(%eax), %ecx
-; X86-SSE2-NEXT:    movl %ecx, (%esp) # 4-byte Spill
+; X86-SSE2-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-SSE2-NEXT:    movl 40(%eax), %ebp
 ; X86-SSE2-NEXT:    movl 44(%eax), %ebx
 ; X86-SSE2-NEXT:    movl 48(%eax), %edi
@@ -1807,13 +15843,17 @@ define void @lshr_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ; X86-SSE2-NEXT:    movl 60(%eax), %ecx
 ; X86-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-SSE2-NEXT:    movl (%eax), %eax
+; X86-SSE2-NEXT:    xorps %xmm0, %xmm0
+; X86-SSE2-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
+; X86-SSE2-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
+; X86-SSE2-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
 ; X86-SSE2-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
 ; X86-SSE2-NEXT:    movl %edx, {{[0-9]+}}(%esp)
 ; X86-SSE2-NEXT:    movl %esi, {{[0-9]+}}(%esp)
 ; X86-SSE2-NEXT:    movl %edi, {{[0-9]+}}(%esp)
 ; X86-SSE2-NEXT:    movl %ebx, {{[0-9]+}}(%esp)
 ; X86-SSE2-NEXT:    movl %ebp, {{[0-9]+}}(%esp)
-; X86-SSE2-NEXT:    movl (%esp), %ecx # 4-byte Reload
+; X86-SSE2-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
 ; X86-SSE2-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
 ; X86-SSE2-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
 ; X86-SSE2-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
@@ -1821,6 +15861,7 @@ define void @lshr_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ; X86-SSE2-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
 ; X86-SSE2-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
 ; X86-SSE2-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X86-SSE2-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
 ; X86-SSE2-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
 ; X86-SSE2-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
 ; X86-SSE2-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
@@ -1833,49 +15874,33 @@ define void @lshr_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ; X86-SSE2-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
 ; X86-SSE2-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
 ; X86-SSE2-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
-; X86-SSE2-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-SSE2-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-SSE2-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-SSE2-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-SSE2-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-SSE2-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-SSE2-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-SSE2-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-SSE2-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-SSE2-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-SSE2-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-SSE2-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-SSE2-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-SSE2-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-SSE2-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-SSE2-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-SSE2-NEXT:    andl $63, %eax
-; X86-SSE2-NEXT:    movl 40(%esp,%eax), %ecx
+; X86-SSE2-NEXT:    andl $7, %eax
+; X86-SSE2-NEXT:    movl 48(%esp,%eax,8), %ecx
+; X86-SSE2-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-SSE2-NEXT:    movl 52(%esp,%eax,8), %ecx
 ; X86-SSE2-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-SSE2-NEXT:    movl 44(%esp,%eax), %ecx
+; X86-SSE2-NEXT:    movl 60(%esp,%eax,8), %ecx
 ; X86-SSE2-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-SSE2-NEXT:    movl 52(%esp,%eax), %ecx
+; X86-SSE2-NEXT:    movl 56(%esp,%eax,8), %ecx
 ; X86-SSE2-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-SSE2-NEXT:    movl 48(%esp,%eax), %ecx
+; X86-SSE2-NEXT:    movl 68(%esp,%eax,8), %ecx
 ; X86-SSE2-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-SSE2-NEXT:    movl 60(%esp,%eax), %ecx
+; X86-SSE2-NEXT:    movl 64(%esp,%eax,8), %ecx
 ; X86-SSE2-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-SSE2-NEXT:    movl 56(%esp,%eax), %ecx
+; X86-SSE2-NEXT:    movl 76(%esp,%eax,8), %ecx
 ; X86-SSE2-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-SSE2-NEXT:    movl 68(%esp,%eax), %ecx
+; X86-SSE2-NEXT:    movl 72(%esp,%eax,8), %ecx
 ; X86-SSE2-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-SSE2-NEXT:    movl 64(%esp,%eax), %ecx
+; X86-SSE2-NEXT:    movl 84(%esp,%eax,8), %ecx
 ; X86-SSE2-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-SSE2-NEXT:    movl 76(%esp,%eax), %ecx
+; X86-SSE2-NEXT:    movl 80(%esp,%eax,8), %ecx
 ; X86-SSE2-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-SSE2-NEXT:    movl 72(%esp,%eax), %ecx
-; X86-SSE2-NEXT:    movl %ecx, (%esp) # 4-byte Spill
-; X86-SSE2-NEXT:    movl 84(%esp,%eax), %ebp
-; X86-SSE2-NEXT:    movl 80(%esp,%eax), %ebx
-; X86-SSE2-NEXT:    movl 92(%esp,%eax), %edi
-; X86-SSE2-NEXT:    movl 88(%esp,%eax), %esi
-; X86-SSE2-NEXT:    movl 100(%esp,%eax), %edx
-; X86-SSE2-NEXT:    movl 96(%esp,%eax), %ecx
+; X86-SSE2-NEXT:    movl 92(%esp,%eax,8), %ebp
+; X86-SSE2-NEXT:    movl 88(%esp,%eax,8), %ebx
+; X86-SSE2-NEXT:    movl 100(%esp,%eax,8), %edi
+; X86-SSE2-NEXT:    movl 96(%esp,%eax,8), %esi
+; X86-SSE2-NEXT:    movl 108(%esp,%eax,8), %edx
+; X86-SSE2-NEXT:    movl 104(%esp,%eax,8), %ecx
 ; X86-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-SSE2-NEXT:    movl %ecx, 56(%eax)
 ; X86-SSE2-NEXT:    movl %edx, 60(%eax)
@@ -1883,7 +15908,7 @@ define void @lshr_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ; X86-SSE2-NEXT:    movl %edi, 52(%eax)
 ; X86-SSE2-NEXT:    movl %ebx, 40(%eax)
 ; X86-SSE2-NEXT:    movl %ebp, 44(%eax)
-; X86-SSE2-NEXT:    movl (%esp), %ecx # 4-byte Reload
+; X86-SSE2-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
 ; X86-SSE2-NEXT:    movl %ecx, 32(%eax)
 ; X86-SSE2-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
 ; X86-SSE2-NEXT:    movl %ecx, 36(%eax)
@@ -1903,16 +15928,16 @@ define void @lshr_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ; X86-SSE2-NEXT:    movl %ecx, (%eax)
 ; X86-SSE2-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
 ; X86-SSE2-NEXT:    movl %ecx, 4(%eax)
-; X86-SSE2-NEXT:    addl $168, %esp
+; X86-SSE2-NEXT:    addl $188, %esp
 ; X86-SSE2-NEXT:    popl %esi
 ; X86-SSE2-NEXT:    popl %edi
 ; X86-SSE2-NEXT:    popl %ebx
 ; X86-SSE2-NEXT:    popl %ebp
 ; X86-SSE2-NEXT:    retl
 ;
-; X86-SSE42-LABEL: lshr_64bytes:
+; X86-SSE42-LABEL: lshr_64bytes_qwordOff:
 ; X86-SSE42:       # %bb.0:
-; X86-SSE42-NEXT:    subl $128, %esp
+; X86-SSE42-NEXT:    subl $140, %esp
 ; X86-SSE42-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-SSE42-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-SSE42-NEXT:    movl {{[0-9]+}}(%esp), %edx
@@ -1922,29 +15947,29 @@ define void @lshr_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ; X86-SSE42-NEXT:    movups 48(%edx), %xmm3
 ; X86-SSE42-NEXT:    movl (%ecx), %ecx
 ; X86-SSE42-NEXT:    xorps %xmm4, %xmm4
-; X86-SSE42-NEXT:    movups %xmm4, {{[0-9]+}}(%esp)
-; X86-SSE42-NEXT:    movups %xmm4, {{[0-9]+}}(%esp)
-; X86-SSE42-NEXT:    movups %xmm4, {{[0-9]+}}(%esp)
-; X86-SSE42-NEXT:    movups %xmm4, {{[0-9]+}}(%esp)
-; X86-SSE42-NEXT:    movups %xmm3, {{[0-9]+}}(%esp)
-; X86-SSE42-NEXT:    movups %xmm2, {{[0-9]+}}(%esp)
-; X86-SSE42-NEXT:    movups %xmm1, {{[0-9]+}}(%esp)
-; X86-SSE42-NEXT:    movups %xmm0, (%esp)
-; X86-SSE42-NEXT:    andl $63, %ecx
-; X86-SSE42-NEXT:    movups (%esp,%ecx), %xmm0
-; X86-SSE42-NEXT:    movups 16(%esp,%ecx), %xmm1
-; X86-SSE42-NEXT:    movups 32(%esp,%ecx), %xmm2
-; X86-SSE42-NEXT:    movups 48(%esp,%ecx), %xmm3
+; X86-SSE42-NEXT:    movaps %xmm4, {{[0-9]+}}(%esp)
+; X86-SSE42-NEXT:    movaps %xmm4, {{[0-9]+}}(%esp)
+; X86-SSE42-NEXT:    movaps %xmm4, {{[0-9]+}}(%esp)
+; X86-SSE42-NEXT:    movaps %xmm4, {{[0-9]+}}(%esp)
+; X86-SSE42-NEXT:    movaps %xmm3, {{[0-9]+}}(%esp)
+; X86-SSE42-NEXT:    movaps %xmm2, {{[0-9]+}}(%esp)
+; X86-SSE42-NEXT:    movaps %xmm1, {{[0-9]+}}(%esp)
+; X86-SSE42-NEXT:    movaps %xmm0, (%esp)
+; X86-SSE42-NEXT:    andl $7, %ecx
+; X86-SSE42-NEXT:    movups (%esp,%ecx,8), %xmm0
+; X86-SSE42-NEXT:    movups 16(%esp,%ecx,8), %xmm1
+; X86-SSE42-NEXT:    movups 32(%esp,%ecx,8), %xmm2
+; X86-SSE42-NEXT:    movups 48(%esp,%ecx,8), %xmm3
 ; X86-SSE42-NEXT:    movups %xmm3, 48(%eax)
 ; X86-SSE42-NEXT:    movups %xmm2, 32(%eax)
 ; X86-SSE42-NEXT:    movups %xmm1, 16(%eax)
 ; X86-SSE42-NEXT:    movups %xmm0, (%eax)
-; X86-SSE42-NEXT:    addl $128, %esp
+; X86-SSE42-NEXT:    addl $140, %esp
 ; X86-SSE42-NEXT:    retl
 ;
-; X86-AVX1-LABEL: lshr_64bytes:
+; X86-AVX1-LABEL: lshr_64bytes_qwordOff:
 ; X86-AVX1:       # %bb.0:
-; X86-AVX1-NEXT:    subl $128, %esp
+; X86-AVX1-NEXT:    subl $140, %esp
 ; X86-AVX1-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-AVX1-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-AVX1-NEXT:    movl {{[0-9]+}}(%esp), %edx
@@ -1956,22 +15981,22 @@ define void @lshr_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ; X86-AVX1-NEXT:    vmovups %ymm2, {{[0-9]+}}(%esp)
 ; X86-AVX1-NEXT:    vmovups %ymm1, {{[0-9]+}}(%esp)
 ; X86-AVX1-NEXT:    vmovups %ymm0, (%esp)
-; X86-AVX1-NEXT:    andl $63, %ecx
-; X86-AVX1-NEXT:    vmovups (%esp,%ecx), %xmm0
-; X86-AVX1-NEXT:    vmovups 16(%esp,%ecx), %xmm1
-; X86-AVX1-NEXT:    vmovups 32(%esp,%ecx), %xmm2
-; X86-AVX1-NEXT:    vmovups 48(%esp,%ecx), %xmm3
+; X86-AVX1-NEXT:    andl $7, %ecx
+; X86-AVX1-NEXT:    vmovups (%esp,%ecx,8), %xmm0
+; X86-AVX1-NEXT:    vmovups 16(%esp,%ecx,8), %xmm1
+; X86-AVX1-NEXT:    vmovups 32(%esp,%ecx,8), %xmm2
+; X86-AVX1-NEXT:    vmovups 48(%esp,%ecx,8), %xmm3
 ; X86-AVX1-NEXT:    vmovups %xmm3, 48(%eax)
 ; X86-AVX1-NEXT:    vmovups %xmm2, 32(%eax)
 ; X86-AVX1-NEXT:    vmovups %xmm1, 16(%eax)
 ; X86-AVX1-NEXT:    vmovups %xmm0, (%eax)
-; X86-AVX1-NEXT:    addl $128, %esp
+; X86-AVX1-NEXT:    addl $140, %esp
 ; X86-AVX1-NEXT:    vzeroupper
 ; X86-AVX1-NEXT:    retl
 ;
-; X86-AVX512-LABEL: lshr_64bytes:
+; X86-AVX512-LABEL: lshr_64bytes_qwordOff:
 ; X86-AVX512:       # %bb.0:
-; X86-AVX512-NEXT:    subl $128, %esp
+; X86-AVX512-NEXT:    subl $140, %esp
 ; X86-AVX512-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-AVX512-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-AVX512-NEXT:    movl {{[0-9]+}}(%esp), %edx
@@ -1980,27 +16005,3801 @@ define void @lshr_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ; X86-AVX512-NEXT:    vxorps %xmm1, %xmm1, %xmm1
 ; X86-AVX512-NEXT:    vmovups %zmm1, {{[0-9]+}}(%esp)
 ; X86-AVX512-NEXT:    vmovups %zmm0, (%esp)
-; X86-AVX512-NEXT:    andl $63, %ecx
-; X86-AVX512-NEXT:    vmovups (%esp,%ecx), %xmm0
-; X86-AVX512-NEXT:    vmovups 16(%esp,%ecx), %xmm1
-; X86-AVX512-NEXT:    vmovups 32(%esp,%ecx), %xmm2
-; X86-AVX512-NEXT:    vmovups 48(%esp,%ecx), %xmm3
+; X86-AVX512-NEXT:    andl $7, %ecx
+; X86-AVX512-NEXT:    vmovups (%esp,%ecx,8), %xmm0
+; X86-AVX512-NEXT:    vmovups 16(%esp,%ecx,8), %xmm1
+; X86-AVX512-NEXT:    vmovups 32(%esp,%ecx,8), %xmm2
+; X86-AVX512-NEXT:    vmovups 48(%esp,%ecx,8), %xmm3
 ; X86-AVX512-NEXT:    vmovups %xmm3, 48(%eax)
 ; X86-AVX512-NEXT:    vmovups %xmm2, 32(%eax)
 ; X86-AVX512-NEXT:    vmovups %xmm1, 16(%eax)
 ; X86-AVX512-NEXT:    vmovups %xmm0, (%eax)
-; X86-AVX512-NEXT:    addl $128, %esp
+; X86-AVX512-NEXT:    addl $140, %esp
 ; X86-AVX512-NEXT:    vzeroupper
 ; X86-AVX512-NEXT:    retl
   %src = load i512, ptr %src.ptr, align 1
-  %byteOff = load i512, ptr %byteOff.ptr, align 1
-  %bitOff = shl i512 %byteOff, 3
+  %qwordOff = load i512, ptr %qwordOff.ptr, align 1
+  %bitOff = shl i512 %qwordOff, 6
   %res = lshr i512 %src, %bitOff
   store i512 %res, ptr %dst, align 1
   ret void
 }
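
The qwordOff variant is the case where the residual bit count is zero: none of the checks above contain a shrdl/shrxl, and the whole operation collapses to a masked, qword-scaled copy — the `andl $7` plus `(%rsp,%rax,8)` addressing. A hypothetical C equivalent (illustrative name, little-endian word order assumed, not code from this patch):

  #include <stdint.h>
  #include <string.h>

  /* Hypothetical reference for lshr_64bytes_qwordOff: the offset is a
     whole number of qwords, so only an aligned copy remains. */
  void lshr_64bytes_qwordOff_ref(const uint64_t src[8], uint64_t qwordOff,
                                 uint64_t dst[8]) {
    uint64_t buf[16] = {0};          /* upper half stays zero */
    memcpy(buf, src, 64);            /* spill the source to the slot */
    qwordOff &= 7;                   /* mirrors the andl $7 masking */
    memcpy(dst, buf + qwordOff, 64); /* qword-scaled load, cf. (%rsp,%rax,8) */
  }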
+
 define void @shl_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
-; X64-SSE2-LABEL: shl_64bytes:
+; FALLBACK0-LABEL: shl_64bytes:
+; FALLBACK0:       # %bb.0:
+; FALLBACK0-NEXT:    pushq %r15
+; FALLBACK0-NEXT:    pushq %r14
+; FALLBACK0-NEXT:    pushq %r13
+; FALLBACK0-NEXT:    pushq %r12
+; FALLBACK0-NEXT:    pushq %rbx
+; FALLBACK0-NEXT:    movq (%rdi), %rax
+; FALLBACK0-NEXT:    movq 8(%rdi), %rcx
+; FALLBACK0-NEXT:    movq 16(%rdi), %r8
+; FALLBACK0-NEXT:    movq 24(%rdi), %r9
+; FALLBACK0-NEXT:    movq 32(%rdi), %r10
+; FALLBACK0-NEXT:    movq 40(%rdi), %r11
+; FALLBACK0-NEXT:    movq 48(%rdi), %rbx
+; FALLBACK0-NEXT:    movq 56(%rdi), %rdi
+; FALLBACK0-NEXT:    movl (%rsi), %esi
+; FALLBACK0-NEXT:    xorps %xmm0, %xmm0
+; FALLBACK0-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
+; FALLBACK0-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
+; FALLBACK0-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
+; FALLBACK0-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
+; FALLBACK0-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK0-NEXT:    movq %rbx, -{{[0-9]+}}(%rsp)
+; FALLBACK0-NEXT:    movq %r11, -{{[0-9]+}}(%rsp)
+; FALLBACK0-NEXT:    movq %r10, -{{[0-9]+}}(%rsp)
+; FALLBACK0-NEXT:    movq %r9, -{{[0-9]+}}(%rsp)
+; FALLBACK0-NEXT:    movq %r8, -{{[0-9]+}}(%rsp)
+; FALLBACK0-NEXT:    movq %rcx, -{{[0-9]+}}(%rsp)
+; FALLBACK0-NEXT:    movq %rax, -{{[0-9]+}}(%rsp)
+; FALLBACK0-NEXT:    leal (,%rsi,8), %eax
+; FALLBACK0-NEXT:    andl $56, %eax
+; FALLBACK0-NEXT:    andl $56, %esi
+; FALLBACK0-NEXT:    negl %esi
+; FALLBACK0-NEXT:    movslq %esi, %rbx
+; FALLBACK0-NEXT:    movq -64(%rsp,%rbx), %r8
+; FALLBACK0-NEXT:    movq -56(%rsp,%rbx), %rdi
+; FALLBACK0-NEXT:    movq %rdi, %r10
+; FALLBACK0-NEXT:    movl %eax, %ecx
+; FALLBACK0-NEXT:    shlq %cl, %r10
+; FALLBACK0-NEXT:    movl %eax, %esi
+; FALLBACK0-NEXT:    notb %sil
+; FALLBACK0-NEXT:    movq %r8, %r9
+; FALLBACK0-NEXT:    shrq %r9
+; FALLBACK0-NEXT:    movl %esi, %ecx
+; FALLBACK0-NEXT:    shrq %cl, %r9
+; FALLBACK0-NEXT:    orq %r10, %r9
+; FALLBACK0-NEXT:    movq -40(%rsp,%rbx), %r10
+; FALLBACK0-NEXT:    movq %r10, %r14
+; FALLBACK0-NEXT:    movl %eax, %ecx
+; FALLBACK0-NEXT:    shlq %cl, %r14
+; FALLBACK0-NEXT:    movq -48(%rsp,%rbx), %r15
+; FALLBACK0-NEXT:    movq %r15, %r11
+; FALLBACK0-NEXT:    shrq %r11
+; FALLBACK0-NEXT:    movl %esi, %ecx
+; FALLBACK0-NEXT:    shrq %cl, %r11
+; FALLBACK0-NEXT:    orq %r14, %r11
+; FALLBACK0-NEXT:    movl %eax, %ecx
+; FALLBACK0-NEXT:    shlq %cl, %r15
+; FALLBACK0-NEXT:    shrq %rdi
+; FALLBACK0-NEXT:    movl %esi, %ecx
+; FALLBACK0-NEXT:    shrq %cl, %rdi
+; FALLBACK0-NEXT:    orq %r15, %rdi
+; FALLBACK0-NEXT:    movq -24(%rsp,%rbx), %r14
+; FALLBACK0-NEXT:    movq %r14, %r12
+; FALLBACK0-NEXT:    movl %eax, %ecx
+; FALLBACK0-NEXT:    shlq %cl, %r12
+; FALLBACK0-NEXT:    movq -32(%rsp,%rbx), %r13
+; FALLBACK0-NEXT:    movq %r13, %r15
+; FALLBACK0-NEXT:    shrq %r15
+; FALLBACK0-NEXT:    movl %esi, %ecx
+; FALLBACK0-NEXT:    shrq %cl, %r15
+; FALLBACK0-NEXT:    orq %r12, %r15
+; FALLBACK0-NEXT:    movl %eax, %ecx
+; FALLBACK0-NEXT:    shlq %cl, %r13
+; FALLBACK0-NEXT:    shrq %r10
+; FALLBACK0-NEXT:    movl %esi, %ecx
+; FALLBACK0-NEXT:    shrq %cl, %r10
+; FALLBACK0-NEXT:    orq %r13, %r10
+; FALLBACK0-NEXT:    movq -8(%rsp,%rbx), %r12
+; FALLBACK0-NEXT:    movl %eax, %ecx
+; FALLBACK0-NEXT:    shlq %cl, %r12
+; FALLBACK0-NEXT:    movq -16(%rsp,%rbx), %rbx
+; FALLBACK0-NEXT:    movq %rbx, %r13
+; FALLBACK0-NEXT:    shrq %r13
+; FALLBACK0-NEXT:    movl %esi, %ecx
+; FALLBACK0-NEXT:    shrq %cl, %r13
+; FALLBACK0-NEXT:    orq %r12, %r13
+; FALLBACK0-NEXT:    movl %eax, %ecx
+; FALLBACK0-NEXT:    shlq %cl, %rbx
+; FALLBACK0-NEXT:    shrq %r14
+; FALLBACK0-NEXT:    movl %esi, %ecx
+; FALLBACK0-NEXT:    shrq %cl, %r14
+; FALLBACK0-NEXT:    orq %rbx, %r14
+; FALLBACK0-NEXT:    movl %eax, %ecx
+; FALLBACK0-NEXT:    shlq %cl, %r8
+; FALLBACK0-NEXT:    movq %r8, (%rdx)
+; FALLBACK0-NEXT:    movq %r14, 48(%rdx)
+; FALLBACK0-NEXT:    movq %r13, 56(%rdx)
+; FALLBACK0-NEXT:    movq %r10, 32(%rdx)
+; FALLBACK0-NEXT:    movq %r15, 40(%rdx)
+; FALLBACK0-NEXT:    movq %rdi, 16(%rdx)
+; FALLBACK0-NEXT:    movq %r11, 24(%rdx)
+; FALLBACK0-NEXT:    movq %r9, 8(%rdx)
+; FALLBACK0-NEXT:    popq %rbx
+; FALLBACK0-NEXT:    popq %r12
+; FALLBACK0-NEXT:    popq %r13
+; FALLBACK0-NEXT:    popq %r14
+; FALLBACK0-NEXT:    popq %r15
+; FALLBACK0-NEXT:    retq
+;
+; FALLBACK1-LABEL: shl_64bytes:
+; FALLBACK1:       # %bb.0:
+; FALLBACK1-NEXT:    pushq %r14
+; FALLBACK1-NEXT:    pushq %rbx
+; FALLBACK1-NEXT:    pushq %rax
+; FALLBACK1-NEXT:    movq (%rdi), %rax
+; FALLBACK1-NEXT:    movq 8(%rdi), %rcx
+; FALLBACK1-NEXT:    movq 16(%rdi), %r8
+; FALLBACK1-NEXT:    movq 24(%rdi), %r9
+; FALLBACK1-NEXT:    movq 32(%rdi), %r10
+; FALLBACK1-NEXT:    movq 40(%rdi), %r11
+; FALLBACK1-NEXT:    movq 48(%rdi), %rbx
+; FALLBACK1-NEXT:    movq 56(%rdi), %rdi
+; FALLBACK1-NEXT:    movl (%rsi), %esi
+; FALLBACK1-NEXT:    xorps %xmm0, %xmm0
+; FALLBACK1-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
+; FALLBACK1-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
+; FALLBACK1-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
+; FALLBACK1-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
+; FALLBACK1-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK1-NEXT:    movq %rbx, -{{[0-9]+}}(%rsp)
+; FALLBACK1-NEXT:    movq %r11, -{{[0-9]+}}(%rsp)
+; FALLBACK1-NEXT:    movq %r10, -{{[0-9]+}}(%rsp)
+; FALLBACK1-NEXT:    movq %r9, -{{[0-9]+}}(%rsp)
+; FALLBACK1-NEXT:    movq %r8, -{{[0-9]+}}(%rsp)
+; FALLBACK1-NEXT:    movq %rcx, -{{[0-9]+}}(%rsp)
+; FALLBACK1-NEXT:    movq %rax, -{{[0-9]+}}(%rsp)
+; FALLBACK1-NEXT:    leal (,%rsi,8), %ecx
+; FALLBACK1-NEXT:    andl $56, %ecx
+; FALLBACK1-NEXT:    andl $56, %esi
+; FALLBACK1-NEXT:    negl %esi
+; FALLBACK1-NEXT:    movslq %esi, %r9
+; FALLBACK1-NEXT:    movq -48(%rsp,%r9), %rax
+; FALLBACK1-NEXT:    movq -40(%rsp,%r9), %r10
+; FALLBACK1-NEXT:    movq %r10, %rsi
+; FALLBACK1-NEXT:    shldq %cl, %rax, %rsi
+; FALLBACK1-NEXT:    movq -64(%rsp,%r9), %r8
+; FALLBACK1-NEXT:    movq -56(%rsp,%r9), %rdi
+; FALLBACK1-NEXT:    shldq %cl, %rdi, %rax
+; FALLBACK1-NEXT:    movq -32(%rsp,%r9), %r11
+; FALLBACK1-NEXT:    movq -24(%rsp,%r9), %rbx
+; FALLBACK1-NEXT:    movq %rbx, %r14
+; FALLBACK1-NEXT:    shldq %cl, %r11, %r14
+; FALLBACK1-NEXT:    shldq %cl, %r10, %r11
+; FALLBACK1-NEXT:    movq -16(%rsp,%r9), %r10
+; FALLBACK1-NEXT:    movq -8(%rsp,%r9), %r9
+; FALLBACK1-NEXT:    shldq %cl, %r10, %r9
+; FALLBACK1-NEXT:    shldq %cl, %rbx, %r10
+; FALLBACK1-NEXT:    shldq %cl, %r8, %rdi
+; FALLBACK1-NEXT:    # kill: def $cl killed $cl killed $ecx
+; FALLBACK1-NEXT:    shlq %cl, %r8
+; FALLBACK1-NEXT:    movq %r10, 48(%rdx)
+; FALLBACK1-NEXT:    movq %r9, 56(%rdx)
+; FALLBACK1-NEXT:    movq %r11, 32(%rdx)
+; FALLBACK1-NEXT:    movq %r14, 40(%rdx)
+; FALLBACK1-NEXT:    movq %rax, 16(%rdx)
+; FALLBACK1-NEXT:    movq %rsi, 24(%rdx)
+; FALLBACK1-NEXT:    movq %r8, (%rdx)
+; FALLBACK1-NEXT:    movq %rdi, 8(%rdx)
+; FALLBACK1-NEXT:    addq $8, %rsp
+; FALLBACK1-NEXT:    popq %rbx
+; FALLBACK1-NEXT:    popq %r14
+; FALLBACK1-NEXT:    retq
+;
+; FALLBACK2-LABEL: shl_64bytes:
+; FALLBACK2:       # %bb.0:
+; FALLBACK2-NEXT:    pushq %rbp
+; FALLBACK2-NEXT:    pushq %r15
+; FALLBACK2-NEXT:    pushq %r14
+; FALLBACK2-NEXT:    pushq %r13
+; FALLBACK2-NEXT:    pushq %r12
+; FALLBACK2-NEXT:    pushq %rbx
+; FALLBACK2-NEXT:    pushq %rax
+; FALLBACK2-NEXT:    movq (%rdi), %rax
+; FALLBACK2-NEXT:    movq 8(%rdi), %rcx
+; FALLBACK2-NEXT:    movq 16(%rdi), %r8
+; FALLBACK2-NEXT:    movq 24(%rdi), %r9
+; FALLBACK2-NEXT:    movq 32(%rdi), %r10
+; FALLBACK2-NEXT:    movq 40(%rdi), %r11
+; FALLBACK2-NEXT:    movq 48(%rdi), %rbx
+; FALLBACK2-NEXT:    movq 56(%rdi), %rdi
+; FALLBACK2-NEXT:    movl (%rsi), %esi
+; FALLBACK2-NEXT:    xorps %xmm0, %xmm0
+; FALLBACK2-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
+; FALLBACK2-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
+; FALLBACK2-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
+; FALLBACK2-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
+; FALLBACK2-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK2-NEXT:    movq %rbx, -{{[0-9]+}}(%rsp)
+; FALLBACK2-NEXT:    movq %r11, -{{[0-9]+}}(%rsp)
+; FALLBACK2-NEXT:    movq %r10, -{{[0-9]+}}(%rsp)
+; FALLBACK2-NEXT:    movq %r9, -{{[0-9]+}}(%rsp)
+; FALLBACK2-NEXT:    movq %r8, -{{[0-9]+}}(%rsp)
+; FALLBACK2-NEXT:    movq %rcx, -{{[0-9]+}}(%rsp)
+; FALLBACK2-NEXT:    movq %rax, -{{[0-9]+}}(%rsp)
+; FALLBACK2-NEXT:    leal (,%rsi,8), %eax
+; FALLBACK2-NEXT:    andl $56, %eax
+; FALLBACK2-NEXT:    andl $56, %esi
+; FALLBACK2-NEXT:    negl %esi
+; FALLBACK2-NEXT:    movslq %esi, %rsi
+; FALLBACK2-NEXT:    movq -64(%rsp,%rsi), %r10
+; FALLBACK2-NEXT:    movq -56(%rsp,%rsi), %rcx
+; FALLBACK2-NEXT:    shlxq %rax, %rcx, %r9
+; FALLBACK2-NEXT:    movq -40(%rsp,%rsi), %rdi
+; FALLBACK2-NEXT:    shlxq %rax, %rdi, %r11
+; FALLBACK2-NEXT:    movq -48(%rsp,%rsi), %r14
+; FALLBACK2-NEXT:    shlxq %rax, %r14, %rbx
+; FALLBACK2-NEXT:    movq -24(%rsp,%rsi), %r8
+; FALLBACK2-NEXT:    shlxq %rax, %r8, %r15
+; FALLBACK2-NEXT:    shlxq %rax, %r10, %r12
+; FALLBACK2-NEXT:    movl %eax, %r13d
+; FALLBACK2-NEXT:    notb %r13b
+; FALLBACK2-NEXT:    shrq %r10
+; FALLBACK2-NEXT:    shrxq %r13, %r10, %r10
+; FALLBACK2-NEXT:    orq %r9, %r10
+; FALLBACK2-NEXT:    movq -32(%rsp,%rsi), %r9
+; FALLBACK2-NEXT:    shlxq %rax, %r9, %rbp
+; FALLBACK2-NEXT:    shrq %r14
+; FALLBACK2-NEXT:    shrxq %r13, %r14, %r14
+; FALLBACK2-NEXT:    orq %r11, %r14
+; FALLBACK2-NEXT:    shlxq %rax, -8(%rsp,%rsi), %r11
+; FALLBACK2-NEXT:    movq -16(%rsp,%rsi), %rsi
+; FALLBACK2-NEXT:    shlxq %rax, %rsi, %rax
+; FALLBACK2-NEXT:    shrq %rcx
+; FALLBACK2-NEXT:    shrxq %r13, %rcx, %rcx
+; FALLBACK2-NEXT:    orq %rbx, %rcx
+; FALLBACK2-NEXT:    shrq %r9
+; FALLBACK2-NEXT:    shrxq %r13, %r9, %r9
+; FALLBACK2-NEXT:    orq %r15, %r9
+; FALLBACK2-NEXT:    shrq %rdi
+; FALLBACK2-NEXT:    shrxq %r13, %rdi, %rdi
+; FALLBACK2-NEXT:    orq %rbp, %rdi
+; FALLBACK2-NEXT:    shrq %rsi
+; FALLBACK2-NEXT:    shrxq %r13, %rsi, %rsi
+; FALLBACK2-NEXT:    orq %r11, %rsi
+; FALLBACK2-NEXT:    shrq %r8
+; FALLBACK2-NEXT:    shrxq %r13, %r8, %r8
+; FALLBACK2-NEXT:    orq %rax, %r8
+; FALLBACK2-NEXT:    movq %r12, (%rdx)
+; FALLBACK2-NEXT:    movq %r8, 48(%rdx)
+; FALLBACK2-NEXT:    movq %rsi, 56(%rdx)
+; FALLBACK2-NEXT:    movq %rdi, 32(%rdx)
+; FALLBACK2-NEXT:    movq %r9, 40(%rdx)
+; FALLBACK2-NEXT:    movq %rcx, 16(%rdx)
+; FALLBACK2-NEXT:    movq %r14, 24(%rdx)
+; FALLBACK2-NEXT:    movq %r10, 8(%rdx)
+; FALLBACK2-NEXT:    addq $8, %rsp
+; FALLBACK2-NEXT:    popq %rbx
+; FALLBACK2-NEXT:    popq %r12
+; FALLBACK2-NEXT:    popq %r13
+; FALLBACK2-NEXT:    popq %r14
+; FALLBACK2-NEXT:    popq %r15
+; FALLBACK2-NEXT:    popq %rbp
+; FALLBACK2-NEXT:    retq
+;
+; FALLBACK3-LABEL: shl_64bytes:
+; FALLBACK3:       # %bb.0:
+; FALLBACK3-NEXT:    pushq %r14
+; FALLBACK3-NEXT:    pushq %rbx
+; FALLBACK3-NEXT:    pushq %rax
+; FALLBACK3-NEXT:    movq (%rdi), %rax
+; FALLBACK3-NEXT:    movq 8(%rdi), %rcx
+; FALLBACK3-NEXT:    movq 16(%rdi), %r8
+; FALLBACK3-NEXT:    movq 24(%rdi), %r9
+; FALLBACK3-NEXT:    movq 32(%rdi), %r10
+; FALLBACK3-NEXT:    movq 40(%rdi), %r11
+; FALLBACK3-NEXT:    movq 48(%rdi), %rbx
+; FALLBACK3-NEXT:    movq 56(%rdi), %rdi
+; FALLBACK3-NEXT:    movl (%rsi), %esi
+; FALLBACK3-NEXT:    xorps %xmm0, %xmm0
+; FALLBACK3-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
+; FALLBACK3-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
+; FALLBACK3-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
+; FALLBACK3-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
+; FALLBACK3-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK3-NEXT:    movq %rbx, -{{[0-9]+}}(%rsp)
+; FALLBACK3-NEXT:    movq %r11, -{{[0-9]+}}(%rsp)
+; FALLBACK3-NEXT:    movq %r10, -{{[0-9]+}}(%rsp)
+; FALLBACK3-NEXT:    movq %r9, -{{[0-9]+}}(%rsp)
+; FALLBACK3-NEXT:    movq %r8, -{{[0-9]+}}(%rsp)
+; FALLBACK3-NEXT:    movq %rcx, -{{[0-9]+}}(%rsp)
+; FALLBACK3-NEXT:    movq %rax, -{{[0-9]+}}(%rsp)
+; FALLBACK3-NEXT:    leal (,%rsi,8), %ecx
+; FALLBACK3-NEXT:    andl $56, %ecx
+; FALLBACK3-NEXT:    andl $56, %esi
+; FALLBACK3-NEXT:    negl %esi
+; FALLBACK3-NEXT:    movslq %esi, %r8
+; FALLBACK3-NEXT:    movq -48(%rsp,%r8), %rax
+; FALLBACK3-NEXT:    movq -40(%rsp,%r8), %r9
+; FALLBACK3-NEXT:    movq %r9, %rsi
+; FALLBACK3-NEXT:    shldq %cl, %rax, %rsi
+; FALLBACK3-NEXT:    movq -64(%rsp,%r8), %r10
+; FALLBACK3-NEXT:    movq -56(%rsp,%r8), %rdi
+; FALLBACK3-NEXT:    shldq %cl, %rdi, %rax
+; FALLBACK3-NEXT:    movq -32(%rsp,%r8), %r11
+; FALLBACK3-NEXT:    movq -24(%rsp,%r8), %rbx
+; FALLBACK3-NEXT:    movq %rbx, %r14
+; FALLBACK3-NEXT:    shldq %cl, %r11, %r14
+; FALLBACK3-NEXT:    shldq %cl, %r9, %r11
+; FALLBACK3-NEXT:    movq -16(%rsp,%r8), %r9
+; FALLBACK3-NEXT:    movq -8(%rsp,%r8), %r8
+; FALLBACK3-NEXT:    shldq %cl, %r9, %r8
+; FALLBACK3-NEXT:    shldq %cl, %rbx, %r9
+; FALLBACK3-NEXT:    shldq %cl, %r10, %rdi
+; FALLBACK3-NEXT:    shlxq %rcx, %r10, %rcx
+; FALLBACK3-NEXT:    movq %r9, 48(%rdx)
+; FALLBACK3-NEXT:    movq %r8, 56(%rdx)
+; FALLBACK3-NEXT:    movq %r11, 32(%rdx)
+; FALLBACK3-NEXT:    movq %r14, 40(%rdx)
+; FALLBACK3-NEXT:    movq %rax, 16(%rdx)
+; FALLBACK3-NEXT:    movq %rsi, 24(%rdx)
+; FALLBACK3-NEXT:    movq %rcx, (%rdx)
+; FALLBACK3-NEXT:    movq %rdi, 8(%rdx)
+; FALLBACK3-NEXT:    addq $8, %rsp
+; FALLBACK3-NEXT:    popq %rbx
+; FALLBACK3-NEXT:    popq %r14
+; FALLBACK3-NEXT:    retq
+;
+; FALLBACK4-LABEL: shl_64bytes:
+; FALLBACK4:       # %bb.0:
+; FALLBACK4-NEXT:    pushq %r15
+; FALLBACK4-NEXT:    pushq %r14
+; FALLBACK4-NEXT:    pushq %r13
+; FALLBACK4-NEXT:    pushq %r12
+; FALLBACK4-NEXT:    pushq %rbx
+; FALLBACK4-NEXT:    movups (%rdi), %xmm0
+; FALLBACK4-NEXT:    movups 16(%rdi), %xmm1
+; FALLBACK4-NEXT:    movups 32(%rdi), %xmm2
+; FALLBACK4-NEXT:    movups 48(%rdi), %xmm3
+; FALLBACK4-NEXT:    movl (%rsi), %ecx
+; FALLBACK4-NEXT:    xorps %xmm4, %xmm4
+; FALLBACK4-NEXT:    movaps %xmm4, -{{[0-9]+}}(%rsp)
+; FALLBACK4-NEXT:    movaps %xmm4, -{{[0-9]+}}(%rsp)
+; FALLBACK4-NEXT:    movaps %xmm4, -{{[0-9]+}}(%rsp)
+; FALLBACK4-NEXT:    movaps %xmm4, -{{[0-9]+}}(%rsp)
+; FALLBACK4-NEXT:    movaps %xmm3, -{{[0-9]+}}(%rsp)
+; FALLBACK4-NEXT:    movaps %xmm2, -{{[0-9]+}}(%rsp)
+; FALLBACK4-NEXT:    movaps %xmm1, -{{[0-9]+}}(%rsp)
+; FALLBACK4-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
+; FALLBACK4-NEXT:    leal (,%rcx,8), %eax
+; FALLBACK4-NEXT:    andl $56, %eax
+; FALLBACK4-NEXT:    andl $56, %ecx
+; FALLBACK4-NEXT:    negl %ecx
+; FALLBACK4-NEXT:    movslq %ecx, %r9
+; FALLBACK4-NEXT:    movq -24(%rsp,%r9), %rdi
+; FALLBACK4-NEXT:    movq %rdi, %r10
+; FALLBACK4-NEXT:    movl %eax, %ecx
+; FALLBACK4-NEXT:    shlq %cl, %r10
+; FALLBACK4-NEXT:    movl %eax, %esi
+; FALLBACK4-NEXT:    notb %sil
+; FALLBACK4-NEXT:    movq -32(%rsp,%r9), %r11
+; FALLBACK4-NEXT:    movq %r11, %r8
+; FALLBACK4-NEXT:    shrq %r8
+; FALLBACK4-NEXT:    movl %esi, %ecx
+; FALLBACK4-NEXT:    shrq %cl, %r8
+; FALLBACK4-NEXT:    orq %r10, %r8
+; FALLBACK4-NEXT:    movl %eax, %ecx
+; FALLBACK4-NEXT:    shlq %cl, %r11
+; FALLBACK4-NEXT:    movq -40(%rsp,%r9), %rbx
+; FALLBACK4-NEXT:    movq %rbx, %r10
+; FALLBACK4-NEXT:    shrq %r10
+; FALLBACK4-NEXT:    movl %esi, %ecx
+; FALLBACK4-NEXT:    shrq %cl, %r10
+; FALLBACK4-NEXT:    orq %r11, %r10
+; FALLBACK4-NEXT:    movl %eax, %ecx
+; FALLBACK4-NEXT:    shlq %cl, %rbx
+; FALLBACK4-NEXT:    movq -48(%rsp,%r9), %r15
+; FALLBACK4-NEXT:    movq %r15, %r11
+; FALLBACK4-NEXT:    shrq %r11
+; FALLBACK4-NEXT:    movl %esi, %ecx
+; FALLBACK4-NEXT:    shrq %cl, %r11
+; FALLBACK4-NEXT:    orq %rbx, %r11
+; FALLBACK4-NEXT:    movl %eax, %ecx
+; FALLBACK4-NEXT:    shlq %cl, %r15
+; FALLBACK4-NEXT:    movq -64(%rsp,%r9), %r14
+; FALLBACK4-NEXT:    movq -56(%rsp,%r9), %r12
+; FALLBACK4-NEXT:    movq %r12, %rbx
+; FALLBACK4-NEXT:    shrq %rbx
+; FALLBACK4-NEXT:    movl %esi, %ecx
+; FALLBACK4-NEXT:    shrq %cl, %rbx
+; FALLBACK4-NEXT:    orq %r15, %rbx
+; FALLBACK4-NEXT:    movl %eax, %ecx
+; FALLBACK4-NEXT:    shlq %cl, %r12
+; FALLBACK4-NEXT:    movq %r14, %r15
+; FALLBACK4-NEXT:    shrq %r15
+; FALLBACK4-NEXT:    movl %esi, %ecx
+; FALLBACK4-NEXT:    shrq %cl, %r15
+; FALLBACK4-NEXT:    orq %r12, %r15
+; FALLBACK4-NEXT:    movq -16(%rsp,%r9), %r12
+; FALLBACK4-NEXT:    movq %r12, %r13
+; FALLBACK4-NEXT:    movl %eax, %ecx
+; FALLBACK4-NEXT:    shlq %cl, %r13
+; FALLBACK4-NEXT:    shrq %rdi
+; FALLBACK4-NEXT:    movl %esi, %ecx
+; FALLBACK4-NEXT:    shrq %cl, %rdi
+; FALLBACK4-NEXT:    orq %r13, %rdi
+; FALLBACK4-NEXT:    movq -8(%rsp,%r9), %r9
+; FALLBACK4-NEXT:    movl %eax, %ecx
+; FALLBACK4-NEXT:    shlq %cl, %r9
+; FALLBACK4-NEXT:    shrq %r12
+; FALLBACK4-NEXT:    movl %esi, %ecx
+; FALLBACK4-NEXT:    shrq %cl, %r12
+; FALLBACK4-NEXT:    orq %r9, %r12
+; FALLBACK4-NEXT:    movl %eax, %ecx
+; FALLBACK4-NEXT:    shlq %cl, %r14
+; FALLBACK4-NEXT:    movq %r14, (%rdx)
+; FALLBACK4-NEXT:    movq %r12, 56(%rdx)
+; FALLBACK4-NEXT:    movq %rdi, 48(%rdx)
+; FALLBACK4-NEXT:    movq %r15, 8(%rdx)
+; FALLBACK4-NEXT:    movq %rbx, 16(%rdx)
+; FALLBACK4-NEXT:    movq %r11, 24(%rdx)
+; FALLBACK4-NEXT:    movq %r10, 32(%rdx)
+; FALLBACK4-NEXT:    movq %r8, 40(%rdx)
+; FALLBACK4-NEXT:    popq %rbx
+; FALLBACK4-NEXT:    popq %r12
+; FALLBACK4-NEXT:    popq %r13
+; FALLBACK4-NEXT:    popq %r14
+; FALLBACK4-NEXT:    popq %r15
+; FALLBACK4-NEXT:    retq
+;
+; FALLBACK5-LABEL: shl_64bytes:
+; FALLBACK5:       # %bb.0:
+; FALLBACK5-NEXT:    pushq %r15
+; FALLBACK5-NEXT:    pushq %r14
+; FALLBACK5-NEXT:    pushq %rbx
+; FALLBACK5-NEXT:    movups (%rdi), %xmm0
+; FALLBACK5-NEXT:    movups 16(%rdi), %xmm1
+; FALLBACK5-NEXT:    movups 32(%rdi), %xmm2
+; FALLBACK5-NEXT:    movups 48(%rdi), %xmm3
+; FALLBACK5-NEXT:    movl (%rsi), %eax
+; FALLBACK5-NEXT:    xorps %xmm4, %xmm4
+; FALLBACK5-NEXT:    movaps %xmm4, -{{[0-9]+}}(%rsp)
+; FALLBACK5-NEXT:    movaps %xmm4, -{{[0-9]+}}(%rsp)
+; FALLBACK5-NEXT:    movaps %xmm4, -{{[0-9]+}}(%rsp)
+; FALLBACK5-NEXT:    movaps %xmm4, -{{[0-9]+}}(%rsp)
+; FALLBACK5-NEXT:    movaps %xmm3, -{{[0-9]+}}(%rsp)
+; FALLBACK5-NEXT:    movaps %xmm2, -{{[0-9]+}}(%rsp)
+; FALLBACK5-NEXT:    movaps %xmm1, -{{[0-9]+}}(%rsp)
+; FALLBACK5-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
+; FALLBACK5-NEXT:    leal (,%rax,8), %ecx
+; FALLBACK5-NEXT:    andl $56, %ecx
+; FALLBACK5-NEXT:    andl $56, %eax
+; FALLBACK5-NEXT:    negl %eax
+; FALLBACK5-NEXT:    movslq %eax, %r8
+; FALLBACK5-NEXT:    movq -32(%rsp,%r8), %rax
+; FALLBACK5-NEXT:    movq -24(%rsp,%r8), %r9
+; FALLBACK5-NEXT:    movq %r9, %rsi
+; FALLBACK5-NEXT:    shldq %cl, %rax, %rsi
+; FALLBACK5-NEXT:    movq -40(%rsp,%r8), %rdi
+; FALLBACK5-NEXT:    shldq %cl, %rdi, %rax
+; FALLBACK5-NEXT:    movq -48(%rsp,%r8), %r10
+; FALLBACK5-NEXT:    shldq %cl, %r10, %rdi
+; FALLBACK5-NEXT:    movq -64(%rsp,%r8), %r11
+; FALLBACK5-NEXT:    movq -56(%rsp,%r8), %rbx
+; FALLBACK5-NEXT:    shldq %cl, %rbx, %r10
+; FALLBACK5-NEXT:    movq -16(%rsp,%r8), %r14
+; FALLBACK5-NEXT:    movq %r14, %r15
+; FALLBACK5-NEXT:    shldq %cl, %r9, %r15
+; FALLBACK5-NEXT:    movq -8(%rsp,%r8), %r8
+; FALLBACK5-NEXT:    shldq %cl, %r14, %r8
+; FALLBACK5-NEXT:    movq %r11, %r9
+; FALLBACK5-NEXT:    shlq %cl, %r9
+; FALLBACK5-NEXT:    # kill: def $cl killed $cl killed $ecx
+; FALLBACK5-NEXT:    shldq %cl, %r11, %rbx
+; FALLBACK5-NEXT:    movq %r8, 56(%rdx)
+; FALLBACK5-NEXT:    movq %r15, 48(%rdx)
+; FALLBACK5-NEXT:    movq %rbx, 8(%rdx)
+; FALLBACK5-NEXT:    movq %r10, 16(%rdx)
+; FALLBACK5-NEXT:    movq %rdi, 24(%rdx)
+; FALLBACK5-NEXT:    movq %rax, 32(%rdx)
+; FALLBACK5-NEXT:    movq %rsi, 40(%rdx)
+; FALLBACK5-NEXT:    movq %r9, (%rdx)
+; FALLBACK5-NEXT:    popq %rbx
+; FALLBACK5-NEXT:    popq %r14
+; FALLBACK5-NEXT:    popq %r15
+; FALLBACK5-NEXT:    retq
+;
+; FALLBACK6-LABEL: shl_64bytes:
+; FALLBACK6:       # %bb.0:
+; FALLBACK6-NEXT:    pushq %rbp
+; FALLBACK6-NEXT:    pushq %r15
+; FALLBACK6-NEXT:    pushq %r14
+; FALLBACK6-NEXT:    pushq %r13
+; FALLBACK6-NEXT:    pushq %r12
+; FALLBACK6-NEXT:    pushq %rbx
+; FALLBACK6-NEXT:    subq $24, %rsp
+; FALLBACK6-NEXT:    movups (%rdi), %xmm0
+; FALLBACK6-NEXT:    movups 16(%rdi), %xmm1
+; FALLBACK6-NEXT:    movups 32(%rdi), %xmm2
+; FALLBACK6-NEXT:    movups 48(%rdi), %xmm3
+; FALLBACK6-NEXT:    movl (%rsi), %eax
+; FALLBACK6-NEXT:    xorps %xmm4, %xmm4
+; FALLBACK6-NEXT:    movaps %xmm4, -{{[0-9]+}}(%rsp)
+; FALLBACK6-NEXT:    movaps %xmm4, -{{[0-9]+}}(%rsp)
+; FALLBACK6-NEXT:    movaps %xmm4, -{{[0-9]+}}(%rsp)
+; FALLBACK6-NEXT:    movaps %xmm4, -{{[0-9]+}}(%rsp)
+; FALLBACK6-NEXT:    movaps %xmm3, (%rsp)
+; FALLBACK6-NEXT:    movaps %xmm2, -{{[0-9]+}}(%rsp)
+; FALLBACK6-NEXT:    movaps %xmm1, -{{[0-9]+}}(%rsp)
+; FALLBACK6-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
+; FALLBACK6-NEXT:    leal (,%rax,8), %ecx
+; FALLBACK6-NEXT:    andl $56, %ecx
+; FALLBACK6-NEXT:    andl $56, %eax
+; FALLBACK6-NEXT:    negl %eax
+; FALLBACK6-NEXT:    movslq %eax, %rsi
+; FALLBACK6-NEXT:    movq -8(%rsp,%rsi), %rax
+; FALLBACK6-NEXT:    shlxq %rcx, %rax, %r12
+; FALLBACK6-NEXT:    movq -16(%rsp,%rsi), %rdi
+; FALLBACK6-NEXT:    shlxq %rcx, %rdi, %r15
+; FALLBACK6-NEXT:    movq -24(%rsp,%rsi), %r13
+; FALLBACK6-NEXT:    shlxq %rcx, %r13, %r8
+; FALLBACK6-NEXT:    movq %r8, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; FALLBACK6-NEXT:    movq -32(%rsp,%rsi), %r11
+; FALLBACK6-NEXT:    shlxq %rcx, %r11, %r10
+; FALLBACK6-NEXT:    movq -40(%rsp,%rsi), %r14
+; FALLBACK6-NEXT:    shlxq %rcx, %r14, %rbx
+; FALLBACK6-NEXT:    movl %ecx, %r9d
+; FALLBACK6-NEXT:    notb %r9b
+; FALLBACK6-NEXT:    shrq %rdi
+; FALLBACK6-NEXT:    shrxq %r9, %rdi, %rdi
+; FALLBACK6-NEXT:    orq %r12, %rdi
+; FALLBACK6-NEXT:    movq (%rsp,%rsi), %rbp
+; FALLBACK6-NEXT:    shlxq %rcx, %rbp, %r8
+; FALLBACK6-NEXT:    shrq %r13
+; FALLBACK6-NEXT:    shrxq %r9, %r13, %r12
+; FALLBACK6-NEXT:    orq %r15, %r12
+; FALLBACK6-NEXT:    shlxq %rcx, 8(%rsp,%rsi), %r15
+; FALLBACK6-NEXT:    movq -48(%rsp,%rsi), %rsi
+; FALLBACK6-NEXT:    shlxq %rcx, %rsi, %rcx
+; FALLBACK6-NEXT:    shrq %r11
+; FALLBACK6-NEXT:    shrxq %r9, %r11, %r11
+; FALLBACK6-NEXT:    orq {{[-0-9]+}}(%r{{[sb]}}p), %r11 # 8-byte Folded Reload
+; FALLBACK6-NEXT:    shrq %r14
+; FALLBACK6-NEXT:    shrxq %r9, %r14, %r14
+; FALLBACK6-NEXT:    orq %r10, %r14
+; FALLBACK6-NEXT:    shrq %rsi
+; FALLBACK6-NEXT:    shrxq %r9, %rsi, %rsi
+; FALLBACK6-NEXT:    orq %rbx, %rsi
+; FALLBACK6-NEXT:    shrq %rax
+; FALLBACK6-NEXT:    shrxq %r9, %rax, %rax
+; FALLBACK6-NEXT:    orq %r8, %rax
+; FALLBACK6-NEXT:    shrq %rbp
+; FALLBACK6-NEXT:    shrxq %r9, %rbp, %r8
+; FALLBACK6-NEXT:    orq %r15, %r8
+; FALLBACK6-NEXT:    movq %rcx, (%rdx)
+; FALLBACK6-NEXT:    movq %r8, 56(%rdx)
+; FALLBACK6-NEXT:    movq %rax, 48(%rdx)
+; FALLBACK6-NEXT:    movq %rsi, 8(%rdx)
+; FALLBACK6-NEXT:    movq %r14, 16(%rdx)
+; FALLBACK6-NEXT:    movq %r11, 24(%rdx)
+; FALLBACK6-NEXT:    movq %r12, 32(%rdx)
+; FALLBACK6-NEXT:    movq %rdi, 40(%rdx)
+; FALLBACK6-NEXT:    addq $24, %rsp
+; FALLBACK6-NEXT:    popq %rbx
+; FALLBACK6-NEXT:    popq %r12
+; FALLBACK6-NEXT:    popq %r13
+; FALLBACK6-NEXT:    popq %r14
+; FALLBACK6-NEXT:    popq %r15
+; FALLBACK6-NEXT:    popq %rbp
+; FALLBACK6-NEXT:    retq
+;
+; FALLBACK7-LABEL: shl_64bytes:
+; FALLBACK7:       # %bb.0:
+; FALLBACK7-NEXT:    pushq %r15
+; FALLBACK7-NEXT:    pushq %r14
+; FALLBACK7-NEXT:    pushq %rbx
+; FALLBACK7-NEXT:    movups (%rdi), %xmm0
+; FALLBACK7-NEXT:    movups 16(%rdi), %xmm1
+; FALLBACK7-NEXT:    movups 32(%rdi), %xmm2
+; FALLBACK7-NEXT:    movups 48(%rdi), %xmm3
+; FALLBACK7-NEXT:    movl (%rsi), %eax
+; FALLBACK7-NEXT:    xorps %xmm4, %xmm4
+; FALLBACK7-NEXT:    movaps %xmm4, -{{[0-9]+}}(%rsp)
+; FALLBACK7-NEXT:    movaps %xmm4, -{{[0-9]+}}(%rsp)
+; FALLBACK7-NEXT:    movaps %xmm4, -{{[0-9]+}}(%rsp)
+; FALLBACK7-NEXT:    movaps %xmm4, -{{[0-9]+}}(%rsp)
+; FALLBACK7-NEXT:    movaps %xmm3, -{{[0-9]+}}(%rsp)
+; FALLBACK7-NEXT:    movaps %xmm2, -{{[0-9]+}}(%rsp)
+; FALLBACK7-NEXT:    movaps %xmm1, -{{[0-9]+}}(%rsp)
+; FALLBACK7-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
+; FALLBACK7-NEXT:    leal (,%rax,8), %ecx
+; FALLBACK7-NEXT:    andl $56, %ecx
+; FALLBACK7-NEXT:    andl $56, %eax
+; FALLBACK7-NEXT:    negl %eax
+; FALLBACK7-NEXT:    movslq %eax, %r8
+; FALLBACK7-NEXT:    movq -32(%rsp,%r8), %rax
+; FALLBACK7-NEXT:    movq -24(%rsp,%r8), %r9
+; FALLBACK7-NEXT:    movq %r9, %rsi
+; FALLBACK7-NEXT:    shldq %cl, %rax, %rsi
+; FALLBACK7-NEXT:    movq -40(%rsp,%r8), %rdi
+; FALLBACK7-NEXT:    shldq %cl, %rdi, %rax
+; FALLBACK7-NEXT:    movq -48(%rsp,%r8), %r10
+; FALLBACK7-NEXT:    shldq %cl, %r10, %rdi
+; FALLBACK7-NEXT:    movq -64(%rsp,%r8), %r11
+; FALLBACK7-NEXT:    movq -56(%rsp,%r8), %rbx
+; FALLBACK7-NEXT:    shldq %cl, %rbx, %r10
+; FALLBACK7-NEXT:    movq -16(%rsp,%r8), %r14
+; FALLBACK7-NEXT:    movq %r14, %r15
+; FALLBACK7-NEXT:    shldq %cl, %r9, %r15
+; FALLBACK7-NEXT:    movq -8(%rsp,%r8), %r8
+; FALLBACK7-NEXT:    shldq %cl, %r14, %r8
+; FALLBACK7-NEXT:    shlxq %rcx, %r11, %r9
+; FALLBACK7-NEXT:    # kill: def $cl killed $cl killed $rcx
+; FALLBACK7-NEXT:    shldq %cl, %r11, %rbx
+; FALLBACK7-NEXT:    movq %r8, 56(%rdx)
+; FALLBACK7-NEXT:    movq %r15, 48(%rdx)
+; FALLBACK7-NEXT:    movq %rbx, 8(%rdx)
+; FALLBACK7-NEXT:    movq %r10, 16(%rdx)
+; FALLBACK7-NEXT:    movq %rdi, 24(%rdx)
+; FALLBACK7-NEXT:    movq %rax, 32(%rdx)
+; FALLBACK7-NEXT:    movq %rsi, 40(%rdx)
+; FALLBACK7-NEXT:    movq %r9, (%rdx)
+; FALLBACK7-NEXT:    popq %rbx
+; FALLBACK7-NEXT:    popq %r14
+; FALLBACK7-NEXT:    popq %r15
+; FALLBACK7-NEXT:    retq
+;
+; FALLBACK8-LABEL: shl_64bytes:
+; FALLBACK8:       # %bb.0:
+; FALLBACK8-NEXT:    pushq %r15
+; FALLBACK8-NEXT:    pushq %r14
+; FALLBACK8-NEXT:    pushq %r13
+; FALLBACK8-NEXT:    pushq %r12
+; FALLBACK8-NEXT:    pushq %rbx
+; FALLBACK8-NEXT:    vmovups (%rdi), %ymm0
+; FALLBACK8-NEXT:    vmovups 32(%rdi), %ymm1
+; FALLBACK8-NEXT:    movl (%rsi), %ecx
+; FALLBACK8-NEXT:    vxorps %xmm2, %xmm2, %xmm2
+; FALLBACK8-NEXT:    vmovups %ymm2, -{{[0-9]+}}(%rsp)
+; FALLBACK8-NEXT:    vmovups %ymm2, -{{[0-9]+}}(%rsp)
+; FALLBACK8-NEXT:    vmovups %ymm1, -{{[0-9]+}}(%rsp)
+; FALLBACK8-NEXT:    vmovups %ymm0, -{{[0-9]+}}(%rsp)
+; FALLBACK8-NEXT:    leal (,%rcx,8), %eax
+; FALLBACK8-NEXT:    andl $56, %eax
+; FALLBACK8-NEXT:    andl $56, %ecx
+; FALLBACK8-NEXT:    negl %ecx
+; FALLBACK8-NEXT:    movslq %ecx, %r9
+; FALLBACK8-NEXT:    movq -24(%rsp,%r9), %rdi
+; FALLBACK8-NEXT:    movq %rdi, %r10
+; FALLBACK8-NEXT:    movl %eax, %ecx
+; FALLBACK8-NEXT:    shlq %cl, %r10
+; FALLBACK8-NEXT:    movl %eax, %esi
+; FALLBACK8-NEXT:    notb %sil
+; FALLBACK8-NEXT:    movq -32(%rsp,%r9), %r11
+; FALLBACK8-NEXT:    movq %r11, %r8
+; FALLBACK8-NEXT:    shrq %r8
+; FALLBACK8-NEXT:    movl %esi, %ecx
+; FALLBACK8-NEXT:    shrq %cl, %r8
+; FALLBACK8-NEXT:    orq %r10, %r8
+; FALLBACK8-NEXT:    movl %eax, %ecx
+; FALLBACK8-NEXT:    shlq %cl, %r11
+; FALLBACK8-NEXT:    movq -40(%rsp,%r9), %rbx
+; FALLBACK8-NEXT:    movq %rbx, %r10
+; FALLBACK8-NEXT:    shrq %r10
+; FALLBACK8-NEXT:    movl %esi, %ecx
+; FALLBACK8-NEXT:    shrq %cl, %r10
+; FALLBACK8-NEXT:    orq %r11, %r10
+; FALLBACK8-NEXT:    movl %eax, %ecx
+; FALLBACK8-NEXT:    shlq %cl, %rbx
+; FALLBACK8-NEXT:    movq -48(%rsp,%r9), %r15
+; FALLBACK8-NEXT:    movq %r15, %r11
+; FALLBACK8-NEXT:    shrq %r11
+; FALLBACK8-NEXT:    movl %esi, %ecx
+; FALLBACK8-NEXT:    shrq %cl, %r11
+; FALLBACK8-NEXT:    orq %rbx, %r11
+; FALLBACK8-NEXT:    movl %eax, %ecx
+; FALLBACK8-NEXT:    shlq %cl, %r15
+; FALLBACK8-NEXT:    movq -64(%rsp,%r9), %r14
+; FALLBACK8-NEXT:    movq -56(%rsp,%r9), %r12
+; FALLBACK8-NEXT:    movq %r12, %rbx
+; FALLBACK8-NEXT:    shrq %rbx
+; FALLBACK8-NEXT:    movl %esi, %ecx
+; FALLBACK8-NEXT:    shrq %cl, %rbx
+; FALLBACK8-NEXT:    orq %r15, %rbx
+; FALLBACK8-NEXT:    movl %eax, %ecx
+; FALLBACK8-NEXT:    shlq %cl, %r12
+; FALLBACK8-NEXT:    movq %r14, %r15
+; FALLBACK8-NEXT:    shrq %r15
+; FALLBACK8-NEXT:    movl %esi, %ecx
+; FALLBACK8-NEXT:    shrq %cl, %r15
+; FALLBACK8-NEXT:    orq %r12, %r15
+; FALLBACK8-NEXT:    movq -16(%rsp,%r9), %r12
+; FALLBACK8-NEXT:    movq %r12, %r13
+; FALLBACK8-NEXT:    movl %eax, %ecx
+; FALLBACK8-NEXT:    shlq %cl, %r13
+; FALLBACK8-NEXT:    shrq %rdi
+; FALLBACK8-NEXT:    movl %esi, %ecx
+; FALLBACK8-NEXT:    shrq %cl, %rdi
+; FALLBACK8-NEXT:    orq %r13, %rdi
+; FALLBACK8-NEXT:    movq -8(%rsp,%r9), %r9
+; FALLBACK8-NEXT:    movl %eax, %ecx
+; FALLBACK8-NEXT:    shlq %cl, %r9
+; FALLBACK8-NEXT:    shrq %r12
+; FALLBACK8-NEXT:    movl %esi, %ecx
+; FALLBACK8-NEXT:    shrq %cl, %r12
+; FALLBACK8-NEXT:    orq %r9, %r12
+; FALLBACK8-NEXT:    movl %eax, %ecx
+; FALLBACK8-NEXT:    shlq %cl, %r14
+; FALLBACK8-NEXT:    movq %r14, (%rdx)
+; FALLBACK8-NEXT:    movq %r12, 56(%rdx)
+; FALLBACK8-NEXT:    movq %rdi, 48(%rdx)
+; FALLBACK8-NEXT:    movq %r15, 8(%rdx)
+; FALLBACK8-NEXT:    movq %rbx, 16(%rdx)
+; FALLBACK8-NEXT:    movq %r11, 24(%rdx)
+; FALLBACK8-NEXT:    movq %r10, 32(%rdx)
+; FALLBACK8-NEXT:    movq %r8, 40(%rdx)
+; FALLBACK8-NEXT:    popq %rbx
+; FALLBACK8-NEXT:    popq %r12
+; FALLBACK8-NEXT:    popq %r13
+; FALLBACK8-NEXT:    popq %r14
+; FALLBACK8-NEXT:    popq %r15
+; FALLBACK8-NEXT:    vzeroupper
+; FALLBACK8-NEXT:    retq
+;
+; FALLBACK9-LABEL: shl_64bytes:
+; FALLBACK9:       # %bb.0:
+; FALLBACK9-NEXT:    pushq %r15
+; FALLBACK9-NEXT:    pushq %r14
+; FALLBACK9-NEXT:    pushq %rbx
+; FALLBACK9-NEXT:    vmovups (%rdi), %ymm0
+; FALLBACK9-NEXT:    vmovups 32(%rdi), %ymm1
+; FALLBACK9-NEXT:    movl (%rsi), %eax
+; FALLBACK9-NEXT:    vxorps %xmm2, %xmm2, %xmm2
+; FALLBACK9-NEXT:    vmovups %ymm2, -{{[0-9]+}}(%rsp)
+; FALLBACK9-NEXT:    vmovups %ymm2, -{{[0-9]+}}(%rsp)
+; FALLBACK9-NEXT:    vmovups %ymm1, -{{[0-9]+}}(%rsp)
+; FALLBACK9-NEXT:    vmovups %ymm0, -{{[0-9]+}}(%rsp)
+; FALLBACK9-NEXT:    leal (,%rax,8), %ecx
+; FALLBACK9-NEXT:    andl $56, %ecx
+; FALLBACK9-NEXT:    andl $56, %eax
+; FALLBACK9-NEXT:    negl %eax
+; FALLBACK9-NEXT:    movslq %eax, %r8
+; FALLBACK9-NEXT:    movq -32(%rsp,%r8), %rax
+; FALLBACK9-NEXT:    movq -24(%rsp,%r8), %r9
+; FALLBACK9-NEXT:    movq %r9, %rsi
+; FALLBACK9-NEXT:    shldq %cl, %rax, %rsi
+; FALLBACK9-NEXT:    movq -40(%rsp,%r8), %rdi
+; FALLBACK9-NEXT:    shldq %cl, %rdi, %rax
+; FALLBACK9-NEXT:    movq -48(%rsp,%r8), %r10
+; FALLBACK9-NEXT:    shldq %cl, %r10, %rdi
+; FALLBACK9-NEXT:    movq -64(%rsp,%r8), %r11
+; FALLBACK9-NEXT:    movq -56(%rsp,%r8), %rbx
+; FALLBACK9-NEXT:    shldq %cl, %rbx, %r10
+; FALLBACK9-NEXT:    movq -16(%rsp,%r8), %r14
+; FALLBACK9-NEXT:    movq %r14, %r15
+; FALLBACK9-NEXT:    shldq %cl, %r9, %r15
+; FALLBACK9-NEXT:    movq -8(%rsp,%r8), %r8
+; FALLBACK9-NEXT:    shldq %cl, %r14, %r8
+; FALLBACK9-NEXT:    movq %r11, %r9
+; FALLBACK9-NEXT:    shlq %cl, %r9
+; FALLBACK9-NEXT:    # kill: def $cl killed $cl killed $ecx
+; FALLBACK9-NEXT:    shldq %cl, %r11, %rbx
+; FALLBACK9-NEXT:    movq %r8, 56(%rdx)
+; FALLBACK9-NEXT:    movq %r15, 48(%rdx)
+; FALLBACK9-NEXT:    movq %rbx, 8(%rdx)
+; FALLBACK9-NEXT:    movq %r10, 16(%rdx)
+; FALLBACK9-NEXT:    movq %rdi, 24(%rdx)
+; FALLBACK9-NEXT:    movq %rax, 32(%rdx)
+; FALLBACK9-NEXT:    movq %rsi, 40(%rdx)
+; FALLBACK9-NEXT:    movq %r9, (%rdx)
+; FALLBACK9-NEXT:    popq %rbx
+; FALLBACK9-NEXT:    popq %r14
+; FALLBACK9-NEXT:    popq %r15
+; FALLBACK9-NEXT:    vzeroupper
+; FALLBACK9-NEXT:    retq
+;
+; FALLBACK10-LABEL: shl_64bytes:
+; FALLBACK10:       # %bb.0:
+; FALLBACK10-NEXT:    pushq %rbp
+; FALLBACK10-NEXT:    pushq %r15
+; FALLBACK10-NEXT:    pushq %r14
+; FALLBACK10-NEXT:    pushq %r13
+; FALLBACK10-NEXT:    pushq %r12
+; FALLBACK10-NEXT:    pushq %rbx
+; FALLBACK10-NEXT:    subq $24, %rsp
+; FALLBACK10-NEXT:    vmovups (%rdi), %ymm0
+; FALLBACK10-NEXT:    vmovups 32(%rdi), %ymm1
+; FALLBACK10-NEXT:    movl (%rsi), %eax
+; FALLBACK10-NEXT:    vxorps %xmm2, %xmm2, %xmm2
+; FALLBACK10-NEXT:    vmovups %ymm2, -{{[0-9]+}}(%rsp)
+; FALLBACK10-NEXT:    vmovups %ymm2, -{{[0-9]+}}(%rsp)
+; FALLBACK10-NEXT:    vmovups %ymm1, -{{[0-9]+}}(%rsp)
+; FALLBACK10-NEXT:    vmovups %ymm0, -{{[0-9]+}}(%rsp)
+; FALLBACK10-NEXT:    leal (,%rax,8), %ecx
+; FALLBACK10-NEXT:    andl $56, %ecx
+; FALLBACK10-NEXT:    andl $56, %eax
+; FALLBACK10-NEXT:    negl %eax
+; FALLBACK10-NEXT:    movslq %eax, %rsi
+; FALLBACK10-NEXT:    movq -8(%rsp,%rsi), %rax
+; FALLBACK10-NEXT:    shlxq %rcx, %rax, %r12
+; FALLBACK10-NEXT:    movq -16(%rsp,%rsi), %rdi
+; FALLBACK10-NEXT:    shlxq %rcx, %rdi, %r15
+; FALLBACK10-NEXT:    movq -24(%rsp,%rsi), %r13
+; FALLBACK10-NEXT:    shlxq %rcx, %r13, %r8
+; FALLBACK10-NEXT:    movq %r8, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; FALLBACK10-NEXT:    movq -32(%rsp,%rsi), %r11
+; FALLBACK10-NEXT:    shlxq %rcx, %r11, %r10
+; FALLBACK10-NEXT:    movq -40(%rsp,%rsi), %r14
+; FALLBACK10-NEXT:    shlxq %rcx, %r14, %rbx
+; FALLBACK10-NEXT:    movl %ecx, %r9d
+; FALLBACK10-NEXT:    notb %r9b
+; FALLBACK10-NEXT:    shrq %rdi
+; FALLBACK10-NEXT:    shrxq %r9, %rdi, %rdi
+; FALLBACK10-NEXT:    orq %r12, %rdi
+; FALLBACK10-NEXT:    movq (%rsp,%rsi), %rbp
+; FALLBACK10-NEXT:    shlxq %rcx, %rbp, %r8
+; FALLBACK10-NEXT:    shrq %r13
+; FALLBACK10-NEXT:    shrxq %r9, %r13, %r12
+; FALLBACK10-NEXT:    orq %r15, %r12
+; FALLBACK10-NEXT:    shlxq %rcx, 8(%rsp,%rsi), %r15
+; FALLBACK10-NEXT:    movq -48(%rsp,%rsi), %rsi
+; FALLBACK10-NEXT:    shlxq %rcx, %rsi, %rcx
+; FALLBACK10-NEXT:    shrq %r11
+; FALLBACK10-NEXT:    shrxq %r9, %r11, %r11
+; FALLBACK10-NEXT:    orq {{[-0-9]+}}(%r{{[sb]}}p), %r11 # 8-byte Folded Reload
+; FALLBACK10-NEXT:    shrq %r14
+; FALLBACK10-NEXT:    shrxq %r9, %r14, %r14
+; FALLBACK10-NEXT:    orq %r10, %r14
+; FALLBACK10-NEXT:    shrq %rsi
+; FALLBACK10-NEXT:    shrxq %r9, %rsi, %rsi
+; FALLBACK10-NEXT:    orq %rbx, %rsi
+; FALLBACK10-NEXT:    shrq %rax
+; FALLBACK10-NEXT:    shrxq %r9, %rax, %rax
+; FALLBACK10-NEXT:    orq %r8, %rax
+; FALLBACK10-NEXT:    shrq %rbp
+; FALLBACK10-NEXT:    shrxq %r9, %rbp, %r8
+; FALLBACK10-NEXT:    orq %r15, %r8
+; FALLBACK10-NEXT:    movq %rcx, (%rdx)
+; FALLBACK10-NEXT:    movq %r8, 56(%rdx)
+; FALLBACK10-NEXT:    movq %rax, 48(%rdx)
+; FALLBACK10-NEXT:    movq %rsi, 8(%rdx)
+; FALLBACK10-NEXT:    movq %r14, 16(%rdx)
+; FALLBACK10-NEXT:    movq %r11, 24(%rdx)
+; FALLBACK10-NEXT:    movq %r12, 32(%rdx)
+; FALLBACK10-NEXT:    movq %rdi, 40(%rdx)
+; FALLBACK10-NEXT:    addq $24, %rsp
+; FALLBACK10-NEXT:    popq %rbx
+; FALLBACK10-NEXT:    popq %r12
+; FALLBACK10-NEXT:    popq %r13
+; FALLBACK10-NEXT:    popq %r14
+; FALLBACK10-NEXT:    popq %r15
+; FALLBACK10-NEXT:    popq %rbp
+; FALLBACK10-NEXT:    vzeroupper
+; FALLBACK10-NEXT:    retq
+;
+; FALLBACK11-LABEL: shl_64bytes:
+; FALLBACK11:       # %bb.0:
+; FALLBACK11-NEXT:    pushq %r15
+; FALLBACK11-NEXT:    pushq %r14
+; FALLBACK11-NEXT:    pushq %rbx
+; FALLBACK11-NEXT:    vmovups (%rdi), %ymm0
+; FALLBACK11-NEXT:    vmovups 32(%rdi), %ymm1
+; FALLBACK11-NEXT:    movl (%rsi), %eax
+; FALLBACK11-NEXT:    vxorps %xmm2, %xmm2, %xmm2
+; FALLBACK11-NEXT:    vmovups %ymm2, -{{[0-9]+}}(%rsp)
+; FALLBACK11-NEXT:    vmovups %ymm2, -{{[0-9]+}}(%rsp)
+; FALLBACK11-NEXT:    vmovups %ymm1, -{{[0-9]+}}(%rsp)
+; FALLBACK11-NEXT:    vmovups %ymm0, -{{[0-9]+}}(%rsp)
+; FALLBACK11-NEXT:    leal (,%rax,8), %ecx
+; FALLBACK11-NEXT:    andl $56, %ecx
+; FALLBACK11-NEXT:    andl $56, %eax
+; FALLBACK11-NEXT:    negl %eax
+; FALLBACK11-NEXT:    movslq %eax, %r8
+; FALLBACK11-NEXT:    movq -32(%rsp,%r8), %rax
+; FALLBACK11-NEXT:    movq -24(%rsp,%r8), %r9
+; FALLBACK11-NEXT:    movq %r9, %rsi
+; FALLBACK11-NEXT:    shldq %cl, %rax, %rsi
+; FALLBACK11-NEXT:    movq -40(%rsp,%r8), %rdi
+; FALLBACK11-NEXT:    shldq %cl, %rdi, %rax
+; FALLBACK11-NEXT:    movq -48(%rsp,%r8), %r10
+; FALLBACK11-NEXT:    shldq %cl, %r10, %rdi
+; FALLBACK11-NEXT:    movq -64(%rsp,%r8), %r11
+; FALLBACK11-NEXT:    movq -56(%rsp,%r8), %rbx
+; FALLBACK11-NEXT:    shldq %cl, %rbx, %r10
+; FALLBACK11-NEXT:    movq -16(%rsp,%r8), %r14
+; FALLBACK11-NEXT:    movq %r14, %r15
+; FALLBACK11-NEXT:    shldq %cl, %r9, %r15
+; FALLBACK11-NEXT:    movq -8(%rsp,%r8), %r8
+; FALLBACK11-NEXT:    shldq %cl, %r14, %r8
+; FALLBACK11-NEXT:    shlxq %rcx, %r11, %r9
+; FALLBACK11-NEXT:    # kill: def $cl killed $cl killed $rcx
+; FALLBACK11-NEXT:    shldq %cl, %r11, %rbx
+; FALLBACK11-NEXT:    movq %r8, 56(%rdx)
+; FALLBACK11-NEXT:    movq %r15, 48(%rdx)
+; FALLBACK11-NEXT:    movq %rbx, 8(%rdx)
+; FALLBACK11-NEXT:    movq %r10, 16(%rdx)
+; FALLBACK11-NEXT:    movq %rdi, 24(%rdx)
+; FALLBACK11-NEXT:    movq %rax, 32(%rdx)
+; FALLBACK11-NEXT:    movq %rsi, 40(%rdx)
+; FALLBACK11-NEXT:    movq %r9, (%rdx)
+; FALLBACK11-NEXT:    popq %rbx
+; FALLBACK11-NEXT:    popq %r14
+; FALLBACK11-NEXT:    popq %r15
+; FALLBACK11-NEXT:    vzeroupper
+; FALLBACK11-NEXT:    retq
+;
+; FALLBACK12-LABEL: shl_64bytes:
+; FALLBACK12:       # %bb.0:
+; FALLBACK12-NEXT:    pushq %r15
+; FALLBACK12-NEXT:    pushq %r14
+; FALLBACK12-NEXT:    pushq %r13
+; FALLBACK12-NEXT:    pushq %r12
+; FALLBACK12-NEXT:    pushq %rbx
+; FALLBACK12-NEXT:    vmovups (%rdi), %zmm0
+; FALLBACK12-NEXT:    movl (%rsi), %ecx
+; FALLBACK12-NEXT:    vxorps %xmm1, %xmm1, %xmm1
+; FALLBACK12-NEXT:    vmovups %zmm1, -{{[0-9]+}}(%rsp)
+; FALLBACK12-NEXT:    vmovups %zmm0, -{{[0-9]+}}(%rsp)
+; FALLBACK12-NEXT:    leal (,%rcx,8), %eax
+; FALLBACK12-NEXT:    andl $56, %eax
+; FALLBACK12-NEXT:    andl $56, %ecx
+; FALLBACK12-NEXT:    negl %ecx
+; FALLBACK12-NEXT:    movslq %ecx, %r9
+; FALLBACK12-NEXT:    movq -24(%rsp,%r9), %rdi
+; FALLBACK12-NEXT:    movq %rdi, %r10
+; FALLBACK12-NEXT:    movl %eax, %ecx
+; FALLBACK12-NEXT:    shlq %cl, %r10
+; FALLBACK12-NEXT:    movl %eax, %esi
+; FALLBACK12-NEXT:    notb %sil
+; FALLBACK12-NEXT:    movq -32(%rsp,%r9), %r11
+; FALLBACK12-NEXT:    movq %r11, %r8
+; FALLBACK12-NEXT:    shrq %r8
+; FALLBACK12-NEXT:    movl %esi, %ecx
+; FALLBACK12-NEXT:    shrq %cl, %r8
+; FALLBACK12-NEXT:    orq %r10, %r8
+; FALLBACK12-NEXT:    movl %eax, %ecx
+; FALLBACK12-NEXT:    shlq %cl, %r11
+; FALLBACK12-NEXT:    movq -40(%rsp,%r9), %rbx
+; FALLBACK12-NEXT:    movq %rbx, %r10
+; FALLBACK12-NEXT:    shrq %r10
+; FALLBACK12-NEXT:    movl %esi, %ecx
+; FALLBACK12-NEXT:    shrq %cl, %r10
+; FALLBACK12-NEXT:    orq %r11, %r10
+; FALLBACK12-NEXT:    movl %eax, %ecx
+; FALLBACK12-NEXT:    shlq %cl, %rbx
+; FALLBACK12-NEXT:    movq -48(%rsp,%r9), %r15
+; FALLBACK12-NEXT:    movq %r15, %r11
+; FALLBACK12-NEXT:    shrq %r11
+; FALLBACK12-NEXT:    movl %esi, %ecx
+; FALLBACK12-NEXT:    shrq %cl, %r11
+; FALLBACK12-NEXT:    orq %rbx, %r11
+; FALLBACK12-NEXT:    movl %eax, %ecx
+; FALLBACK12-NEXT:    shlq %cl, %r15
+; FALLBACK12-NEXT:    movq -64(%rsp,%r9), %r14
+; FALLBACK12-NEXT:    movq -56(%rsp,%r9), %r12
+; FALLBACK12-NEXT:    movq %r12, %rbx
+; FALLBACK12-NEXT:    shrq %rbx
+; FALLBACK12-NEXT:    movl %esi, %ecx
+; FALLBACK12-NEXT:    shrq %cl, %rbx
+; FALLBACK12-NEXT:    orq %r15, %rbx
+; FALLBACK12-NEXT:    movl %eax, %ecx
+; FALLBACK12-NEXT:    shlq %cl, %r12
+; FALLBACK12-NEXT:    movq %r14, %r15
+; FALLBACK12-NEXT:    shrq %r15
+; FALLBACK12-NEXT:    movl %esi, %ecx
+; FALLBACK12-NEXT:    shrq %cl, %r15
+; FALLBACK12-NEXT:    orq %r12, %r15
+; FALLBACK12-NEXT:    movq -16(%rsp,%r9), %r12
+; FALLBACK12-NEXT:    movq %r12, %r13
+; FALLBACK12-NEXT:    movl %eax, %ecx
+; FALLBACK12-NEXT:    shlq %cl, %r13
+; FALLBACK12-NEXT:    shrq %rdi
+; FALLBACK12-NEXT:    movl %esi, %ecx
+; FALLBACK12-NEXT:    shrq %cl, %rdi
+; FALLBACK12-NEXT:    orq %r13, %rdi
+; FALLBACK12-NEXT:    movq -8(%rsp,%r9), %r9
+; FALLBACK12-NEXT:    movl %eax, %ecx
+; FALLBACK12-NEXT:    shlq %cl, %r9
+; FALLBACK12-NEXT:    shrq %r12
+; FALLBACK12-NEXT:    movl %esi, %ecx
+; FALLBACK12-NEXT:    shrq %cl, %r12
+; FALLBACK12-NEXT:    orq %r9, %r12
+; FALLBACK12-NEXT:    movl %eax, %ecx
+; FALLBACK12-NEXT:    shlq %cl, %r14
+; FALLBACK12-NEXT:    movq %r14, (%rdx)
+; FALLBACK12-NEXT:    movq %r12, 56(%rdx)
+; FALLBACK12-NEXT:    movq %rdi, 48(%rdx)
+; FALLBACK12-NEXT:    movq %r15, 8(%rdx)
+; FALLBACK12-NEXT:    movq %rbx, 16(%rdx)
+; FALLBACK12-NEXT:    movq %r11, 24(%rdx)
+; FALLBACK12-NEXT:    movq %r10, 32(%rdx)
+; FALLBACK12-NEXT:    movq %r8, 40(%rdx)
+; FALLBACK12-NEXT:    popq %rbx
+; FALLBACK12-NEXT:    popq %r12
+; FALLBACK12-NEXT:    popq %r13
+; FALLBACK12-NEXT:    popq %r14
+; FALLBACK12-NEXT:    popq %r15
+; FALLBACK12-NEXT:    vzeroupper
+; FALLBACK12-NEXT:    retq
+;
+; FALLBACK13-LABEL: shl_64bytes:
+; FALLBACK13:       # %bb.0:
+; FALLBACK13-NEXT:    pushq %r15
+; FALLBACK13-NEXT:    pushq %r14
+; FALLBACK13-NEXT:    pushq %rbx
+; FALLBACK13-NEXT:    vmovups (%rdi), %zmm0
+; FALLBACK13-NEXT:    movl (%rsi), %eax
+; FALLBACK13-NEXT:    vxorps %xmm1, %xmm1, %xmm1
+; FALLBACK13-NEXT:    vmovups %zmm1, -{{[0-9]+}}(%rsp)
+; FALLBACK13-NEXT:    vmovups %zmm0, -{{[0-9]+}}(%rsp)
+; FALLBACK13-NEXT:    leal (,%rax,8), %ecx
+; FALLBACK13-NEXT:    andl $56, %ecx
+; FALLBACK13-NEXT:    andl $56, %eax
+; FALLBACK13-NEXT:    negl %eax
+; FALLBACK13-NEXT:    movslq %eax, %r8
+; FALLBACK13-NEXT:    movq -32(%rsp,%r8), %rax
+; FALLBACK13-NEXT:    movq -24(%rsp,%r8), %r9
+; FALLBACK13-NEXT:    movq %r9, %rsi
+; FALLBACK13-NEXT:    shldq %cl, %rax, %rsi
+; FALLBACK13-NEXT:    movq -40(%rsp,%r8), %rdi
+; FALLBACK13-NEXT:    shldq %cl, %rdi, %rax
+; FALLBACK13-NEXT:    movq -48(%rsp,%r8), %r10
+; FALLBACK13-NEXT:    shldq %cl, %r10, %rdi
+; FALLBACK13-NEXT:    movq -64(%rsp,%r8), %r11
+; FALLBACK13-NEXT:    movq -56(%rsp,%r8), %rbx
+; FALLBACK13-NEXT:    shldq %cl, %rbx, %r10
+; FALLBACK13-NEXT:    movq -16(%rsp,%r8), %r14
+; FALLBACK13-NEXT:    movq %r14, %r15
+; FALLBACK13-NEXT:    shldq %cl, %r9, %r15
+; FALLBACK13-NEXT:    movq -8(%rsp,%r8), %r8
+; FALLBACK13-NEXT:    shldq %cl, %r14, %r8
+; FALLBACK13-NEXT:    movq %r11, %r9
+; FALLBACK13-NEXT:    shlq %cl, %r9
+; FALLBACK13-NEXT:    # kill: def $cl killed $cl killed $ecx
+; FALLBACK13-NEXT:    shldq %cl, %r11, %rbx
+; FALLBACK13-NEXT:    movq %r8, 56(%rdx)
+; FALLBACK13-NEXT:    movq %r15, 48(%rdx)
+; FALLBACK13-NEXT:    movq %rbx, 8(%rdx)
+; FALLBACK13-NEXT:    movq %r10, 16(%rdx)
+; FALLBACK13-NEXT:    movq %rdi, 24(%rdx)
+; FALLBACK13-NEXT:    movq %rax, 32(%rdx)
+; FALLBACK13-NEXT:    movq %rsi, 40(%rdx)
+; FALLBACK13-NEXT:    movq %r9, (%rdx)
+; FALLBACK13-NEXT:    popq %rbx
+; FALLBACK13-NEXT:    popq %r14
+; FALLBACK13-NEXT:    popq %r15
+; FALLBACK13-NEXT:    vzeroupper
+; FALLBACK13-NEXT:    retq
+;
+; FALLBACK14-LABEL: shl_64bytes:
+; FALLBACK14:       # %bb.0:
+; FALLBACK14-NEXT:    pushq %rbp
+; FALLBACK14-NEXT:    pushq %r15
+; FALLBACK14-NEXT:    pushq %r14
+; FALLBACK14-NEXT:    pushq %r13
+; FALLBACK14-NEXT:    pushq %r12
+; FALLBACK14-NEXT:    pushq %rbx
+; FALLBACK14-NEXT:    subq $24, %rsp
+; FALLBACK14-NEXT:    vmovups (%rdi), %zmm0
+; FALLBACK14-NEXT:    movl (%rsi), %eax
+; FALLBACK14-NEXT:    vxorps %xmm1, %xmm1, %xmm1
+; FALLBACK14-NEXT:    vmovups %zmm1, -{{[0-9]+}}(%rsp)
+; FALLBACK14-NEXT:    vmovups %zmm0, -{{[0-9]+}}(%rsp)
+; FALLBACK14-NEXT:    leal (,%rax,8), %ecx
+; FALLBACK14-NEXT:    andl $56, %ecx
+; FALLBACK14-NEXT:    andl $56, %eax
+; FALLBACK14-NEXT:    negl %eax
+; FALLBACK14-NEXT:    movslq %eax, %rsi
+; FALLBACK14-NEXT:    movq -8(%rsp,%rsi), %rax
+; FALLBACK14-NEXT:    shlxq %rcx, %rax, %r12
+; FALLBACK14-NEXT:    movq -16(%rsp,%rsi), %rdi
+; FALLBACK14-NEXT:    shlxq %rcx, %rdi, %r15
+; FALLBACK14-NEXT:    movq -24(%rsp,%rsi), %r13
+; FALLBACK14-NEXT:    shlxq %rcx, %r13, %r8
+; FALLBACK14-NEXT:    movq %r8, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; FALLBACK14-NEXT:    movq -32(%rsp,%rsi), %r11
+; FALLBACK14-NEXT:    shlxq %rcx, %r11, %r10
+; FALLBACK14-NEXT:    movq -40(%rsp,%rsi), %r14
+; FALLBACK14-NEXT:    shlxq %rcx, %r14, %rbx
+; FALLBACK14-NEXT:    movl %ecx, %r9d
+; FALLBACK14-NEXT:    notb %r9b
+; FALLBACK14-NEXT:    shrq %rdi
+; FALLBACK14-NEXT:    shrxq %r9, %rdi, %rdi
+; FALLBACK14-NEXT:    orq %r12, %rdi
+; FALLBACK14-NEXT:    movq (%rsp,%rsi), %rbp
+; FALLBACK14-NEXT:    shlxq %rcx, %rbp, %r8
+; FALLBACK14-NEXT:    shrq %r13
+; FALLBACK14-NEXT:    shrxq %r9, %r13, %r12
+; FALLBACK14-NEXT:    orq %r15, %r12
+; FALLBACK14-NEXT:    shlxq %rcx, 8(%rsp,%rsi), %r15
+; FALLBACK14-NEXT:    movq -48(%rsp,%rsi), %rsi
+; FALLBACK14-NEXT:    shlxq %rcx, %rsi, %rcx
+; FALLBACK14-NEXT:    shrq %r11
+; FALLBACK14-NEXT:    shrxq %r9, %r11, %r11
+; FALLBACK14-NEXT:    orq {{[-0-9]+}}(%r{{[sb]}}p), %r11 # 8-byte Folded Reload
+; FALLBACK14-NEXT:    shrq %r14
+; FALLBACK14-NEXT:    shrxq %r9, %r14, %r14
+; FALLBACK14-NEXT:    orq %r10, %r14
+; FALLBACK14-NEXT:    shrq %rsi
+; FALLBACK14-NEXT:    shrxq %r9, %rsi, %rsi
+; FALLBACK14-NEXT:    orq %rbx, %rsi
+; FALLBACK14-NEXT:    shrq %rax
+; FALLBACK14-NEXT:    shrxq %r9, %rax, %rax
+; FALLBACK14-NEXT:    orq %r8, %rax
+; FALLBACK14-NEXT:    shrq %rbp
+; FALLBACK14-NEXT:    shrxq %r9, %rbp, %r8
+; FALLBACK14-NEXT:    orq %r15, %r8
+; FALLBACK14-NEXT:    movq %rcx, (%rdx)
+; FALLBACK14-NEXT:    movq %r8, 56(%rdx)
+; FALLBACK14-NEXT:    movq %rax, 48(%rdx)
+; FALLBACK14-NEXT:    movq %rsi, 8(%rdx)
+; FALLBACK14-NEXT:    movq %r14, 16(%rdx)
+; FALLBACK14-NEXT:    movq %r11, 24(%rdx)
+; FALLBACK14-NEXT:    movq %r12, 32(%rdx)
+; FALLBACK14-NEXT:    movq %rdi, 40(%rdx)
+; FALLBACK14-NEXT:    addq $24, %rsp
+; FALLBACK14-NEXT:    popq %rbx
+; FALLBACK14-NEXT:    popq %r12
+; FALLBACK14-NEXT:    popq %r13
+; FALLBACK14-NEXT:    popq %r14
+; FALLBACK14-NEXT:    popq %r15
+; FALLBACK14-NEXT:    popq %rbp
+; FALLBACK14-NEXT:    vzeroupper
+; FALLBACK14-NEXT:    retq
+;
+; FALLBACK15-LABEL: shl_64bytes:
+; FALLBACK15:       # %bb.0:
+; FALLBACK15-NEXT:    pushq %r15
+; FALLBACK15-NEXT:    pushq %r14
+; FALLBACK15-NEXT:    pushq %rbx
+; FALLBACK15-NEXT:    vmovups (%rdi), %zmm0
+; FALLBACK15-NEXT:    movl (%rsi), %eax
+; FALLBACK15-NEXT:    vxorps %xmm1, %xmm1, %xmm1
+; FALLBACK15-NEXT:    vmovups %zmm1, -{{[0-9]+}}(%rsp)
+; FALLBACK15-NEXT:    vmovups %zmm0, -{{[0-9]+}}(%rsp)
+; FALLBACK15-NEXT:    leal (,%rax,8), %ecx
+; FALLBACK15-NEXT:    andl $56, %ecx
+; FALLBACK15-NEXT:    andl $56, %eax
+; FALLBACK15-NEXT:    negl %eax
+; FALLBACK15-NEXT:    movslq %eax, %r8
+; FALLBACK15-NEXT:    movq -32(%rsp,%r8), %rax
+; FALLBACK15-NEXT:    movq -24(%rsp,%r8), %r9
+; FALLBACK15-NEXT:    movq %r9, %rsi
+; FALLBACK15-NEXT:    shldq %cl, %rax, %rsi
+; FALLBACK15-NEXT:    movq -40(%rsp,%r8), %rdi
+; FALLBACK15-NEXT:    shldq %cl, %rdi, %rax
+; FALLBACK15-NEXT:    movq -48(%rsp,%r8), %r10
+; FALLBACK15-NEXT:    shldq %cl, %r10, %rdi
+; FALLBACK15-NEXT:    movq -64(%rsp,%r8), %r11
+; FALLBACK15-NEXT:    movq -56(%rsp,%r8), %rbx
+; FALLBACK15-NEXT:    shldq %cl, %rbx, %r10
+; FALLBACK15-NEXT:    movq -16(%rsp,%r8), %r14
+; FALLBACK15-NEXT:    movq %r14, %r15
+; FALLBACK15-NEXT:    shldq %cl, %r9, %r15
+; FALLBACK15-NEXT:    movq -8(%rsp,%r8), %r8
+; FALLBACK15-NEXT:    shldq %cl, %r14, %r8
+; FALLBACK15-NEXT:    shlxq %rcx, %r11, %r9
+; FALLBACK15-NEXT:    # kill: def $cl killed $cl killed $rcx
+; FALLBACK15-NEXT:    shldq %cl, %r11, %rbx
+; FALLBACK15-NEXT:    movq %r8, 56(%rdx)
+; FALLBACK15-NEXT:    movq %r15, 48(%rdx)
+; FALLBACK15-NEXT:    movq %rbx, 8(%rdx)
+; FALLBACK15-NEXT:    movq %r10, 16(%rdx)
+; FALLBACK15-NEXT:    movq %rdi, 24(%rdx)
+; FALLBACK15-NEXT:    movq %rax, 32(%rdx)
+; FALLBACK15-NEXT:    movq %rsi, 40(%rdx)
+; FALLBACK15-NEXT:    movq %r9, (%rdx)
+; FALLBACK15-NEXT:    popq %rbx
+; FALLBACK15-NEXT:    popq %r14
+; FALLBACK15-NEXT:    popq %r15
+; FALLBACK15-NEXT:    vzeroupper
+; FALLBACK15-NEXT:    retq
+;
+; FALLBACK16-LABEL: shl_64bytes:
+; FALLBACK16:       # %bb.0:
+; FALLBACK16-NEXT:    pushl %ebp
+; FALLBACK16-NEXT:    pushl %ebx
+; FALLBACK16-NEXT:    pushl %edi
+; FALLBACK16-NEXT:    pushl %esi
+; FALLBACK16-NEXT:    subl $204, %esp
+; FALLBACK16-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; FALLBACK16-NEXT:    movl (%eax), %ecx
+; FALLBACK16-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK16-NEXT:    movl 4(%eax), %ecx
+; FALLBACK16-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK16-NEXT:    movl 8(%eax), %ecx
+; FALLBACK16-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK16-NEXT:    movl 12(%eax), %ecx
+; FALLBACK16-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK16-NEXT:    movl 16(%eax), %ecx
+; FALLBACK16-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK16-NEXT:    movl 20(%eax), %ecx
+; FALLBACK16-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK16-NEXT:    movl 24(%eax), %ecx
+; FALLBACK16-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK16-NEXT:    movl 28(%eax), %ecx
+; FALLBACK16-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK16-NEXT:    movl 32(%eax), %ecx
+; FALLBACK16-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK16-NEXT:    movl 36(%eax), %ecx
+; FALLBACK16-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK16-NEXT:    movl 40(%eax), %ebp
+; FALLBACK16-NEXT:    movl 44(%eax), %ebx
+; FALLBACK16-NEXT:    movl 48(%eax), %edi
+; FALLBACK16-NEXT:    movl 52(%eax), %esi
+; FALLBACK16-NEXT:    movl 56(%eax), %edx
+; FALLBACK16-NEXT:    movl 60(%eax), %ecx
+; FALLBACK16-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; FALLBACK16-NEXT:    movl (%eax), %eax
+; FALLBACK16-NEXT:    xorps %xmm0, %xmm0
+; FALLBACK16-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
+; FALLBACK16-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; FALLBACK16-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; FALLBACK16-NEXT:    movl %esi, {{[0-9]+}}(%esp)
+; FALLBACK16-NEXT:    movl %edi, {{[0-9]+}}(%esp)
+; FALLBACK16-NEXT:    movl %ebx, {{[0-9]+}}(%esp)
+; FALLBACK16-NEXT:    movl %ebp, {{[0-9]+}}(%esp)
+; FALLBACK16-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK16-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; FALLBACK16-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK16-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; FALLBACK16-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK16-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; FALLBACK16-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK16-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; FALLBACK16-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
+; FALLBACK16-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
+; FALLBACK16-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
+; FALLBACK16-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK16-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; FALLBACK16-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK16-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; FALLBACK16-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK16-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; FALLBACK16-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK16-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; FALLBACK16-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK16-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; FALLBACK16-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK16-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; FALLBACK16-NEXT:    movl %eax, %edx
+; FALLBACK16-NEXT:    andl $60, %edx
+; FALLBACK16-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK16-NEXT:    leal {{[0-9]+}}(%esp), %ecx
+; FALLBACK16-NEXT:    subl %edx, %ecx
+; FALLBACK16-NEXT:    movl (%ecx), %edi
+; FALLBACK16-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK16-NEXT:    movl 4(%ecx), %edx
+; FALLBACK16-NEXT:    movl %ecx, %ebp
+; FALLBACK16-NEXT:    shll $3, %eax
+; FALLBACK16-NEXT:    andl $24, %eax
+; FALLBACK16-NEXT:    movl %edx, %esi
+; FALLBACK16-NEXT:    movl %eax, %ecx
+; FALLBACK16-NEXT:    shll %cl, %esi
+; FALLBACK16-NEXT:    shrl %edi
+; FALLBACK16-NEXT:    movb %al, %ch
+; FALLBACK16-NEXT:    notb %ch
+; FALLBACK16-NEXT:    movb %ch, %cl
+; FALLBACK16-NEXT:    shrl %cl, %edi
+; FALLBACK16-NEXT:    orl %esi, %edi
+; FALLBACK16-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK16-NEXT:    movl 12(%ebp), %ebx
+; FALLBACK16-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK16-NEXT:    movb %al, %cl
+; FALLBACK16-NEXT:    shll %cl, %ebx
+; FALLBACK16-NEXT:    movl 8(%ebp), %esi
+; FALLBACK16-NEXT:    movl %ebp, %edi
+; FALLBACK16-NEXT:    movl %esi, %ebp
+; FALLBACK16-NEXT:    shrl %ebp
+; FALLBACK16-NEXT:    movb %ch, %cl
+; FALLBACK16-NEXT:    shrl %cl, %ebp
+; FALLBACK16-NEXT:    orl %ebx, %ebp
+; FALLBACK16-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK16-NEXT:    movb %al, %cl
+; FALLBACK16-NEXT:    shll %cl, %esi
+; FALLBACK16-NEXT:    shrl %edx
+; FALLBACK16-NEXT:    movb %ch, %cl
+; FALLBACK16-NEXT:    shrl %cl, %edx
+; FALLBACK16-NEXT:    orl %esi, %edx
+; FALLBACK16-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK16-NEXT:    movl %edi, %ebp
+; FALLBACK16-NEXT:    movl 20(%edi), %ebx
+; FALLBACK16-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK16-NEXT:    movb %al, %cl
+; FALLBACK16-NEXT:    shll %cl, %ebx
+; FALLBACK16-NEXT:    movl 16(%edi), %esi
+; FALLBACK16-NEXT:    movl %esi, %edx
+; FALLBACK16-NEXT:    shrl %edx
+; FALLBACK16-NEXT:    movb %ch, %cl
+; FALLBACK16-NEXT:    shrl %cl, %edx
+; FALLBACK16-NEXT:    orl %ebx, %edx
+; FALLBACK16-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK16-NEXT:    movb %al, %cl
+; FALLBACK16-NEXT:    shll %cl, %esi
+; FALLBACK16-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; FALLBACK16-NEXT:    shrl %edi
+; FALLBACK16-NEXT:    movb %ch, %cl
+; FALLBACK16-NEXT:    shrl %cl, %edi
+; FALLBACK16-NEXT:    orl %esi, %edi
+; FALLBACK16-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK16-NEXT:    movl %ebp, %edx
+; FALLBACK16-NEXT:    movl 28(%ebp), %ebx
+; FALLBACK16-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK16-NEXT:    movb %al, %cl
+; FALLBACK16-NEXT:    shll %cl, %ebx
+; FALLBACK16-NEXT:    movl 24(%ebp), %esi
+; FALLBACK16-NEXT:    movl %esi, %edi
+; FALLBACK16-NEXT:    shrl %edi
+; FALLBACK16-NEXT:    movb %ch, %cl
+; FALLBACK16-NEXT:    shrl %cl, %edi
+; FALLBACK16-NEXT:    orl %ebx, %edi
+; FALLBACK16-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK16-NEXT:    movb %al, %cl
+; FALLBACK16-NEXT:    shll %cl, %esi
+; FALLBACK16-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
+; FALLBACK16-NEXT:    shrl %ebp
+; FALLBACK16-NEXT:    movb %ch, %cl
+; FALLBACK16-NEXT:    shrl %cl, %ebp
+; FALLBACK16-NEXT:    orl %esi, %ebp
+; FALLBACK16-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK16-NEXT:    movl 36(%edx), %ebx
+; FALLBACK16-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK16-NEXT:    movb %al, %cl
+; FALLBACK16-NEXT:    shll %cl, %ebx
+; FALLBACK16-NEXT:    movl 32(%edx), %esi
+; FALLBACK16-NEXT:    movl %edx, %ebp
+; FALLBACK16-NEXT:    movl %esi, %edi
+; FALLBACK16-NEXT:    shrl %edi
+; FALLBACK16-NEXT:    movb %ch, %cl
+; FALLBACK16-NEXT:    shrl %cl, %edi
+; FALLBACK16-NEXT:    orl %ebx, %edi
+; FALLBACK16-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK16-NEXT:    movb %al, %cl
+; FALLBACK16-NEXT:    shll %cl, %esi
+; FALLBACK16-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; FALLBACK16-NEXT:    shrl %edx
+; FALLBACK16-NEXT:    movb %ch, %cl
+; FALLBACK16-NEXT:    shrl %cl, %edx
+; FALLBACK16-NEXT:    orl %esi, %edx
+; FALLBACK16-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK16-NEXT:    movl 44(%ebp), %ebx
+; FALLBACK16-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK16-NEXT:    movb %al, %cl
+; FALLBACK16-NEXT:    shll %cl, %ebx
+; FALLBACK16-NEXT:    movl 40(%ebp), %esi
+; FALLBACK16-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK16-NEXT:    movl %esi, %edx
+; FALLBACK16-NEXT:    shrl %edx
+; FALLBACK16-NEXT:    movb %ch, %cl
+; FALLBACK16-NEXT:    shrl %cl, %edx
+; FALLBACK16-NEXT:    orl %ebx, %edx
+; FALLBACK16-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK16-NEXT:    movb %al, %cl
+; FALLBACK16-NEXT:    shll %cl, %esi
+; FALLBACK16-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; FALLBACK16-NEXT:    shrl %edx
+; FALLBACK16-NEXT:    movb %ch, %cl
+; FALLBACK16-NEXT:    shrl %cl, %edx
+; FALLBACK16-NEXT:    orl %esi, %edx
+; FALLBACK16-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK16-NEXT:    movl 52(%ebp), %esi
+; FALLBACK16-NEXT:    movl %esi, %edi
+; FALLBACK16-NEXT:    movb %al, %cl
+; FALLBACK16-NEXT:    shll %cl, %edi
+; FALLBACK16-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; FALLBACK16-NEXT:    negl %edx
+; FALLBACK16-NEXT:    movl 176(%esp,%edx), %ebx
+; FALLBACK16-NEXT:    movl %ebx, %ebp
+; FALLBACK16-NEXT:    shrl %ebp
+; FALLBACK16-NEXT:    movb %ch, %cl
+; FALLBACK16-NEXT:    shrl %cl, %ebp
+; FALLBACK16-NEXT:    orl %edi, %ebp
+; FALLBACK16-NEXT:    movb %al, %cl
+; FALLBACK16-NEXT:    shll %cl, %ebx
+; FALLBACK16-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; FALLBACK16-NEXT:    shrl %edx
+; FALLBACK16-NEXT:    movb %ch, %cl
+; FALLBACK16-NEXT:    shrl %cl, %edx
+; FALLBACK16-NEXT:    orl %ebx, %edx
+; FALLBACK16-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK16-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; FALLBACK16-NEXT:    movl 60(%edi), %edx
+; FALLBACK16-NEXT:    movb %al, %cl
+; FALLBACK16-NEXT:    shll %cl, %edx
+; FALLBACK16-NEXT:    movl 56(%edi), %ebx
+; FALLBACK16-NEXT:    movl %ebx, %edi
+; FALLBACK16-NEXT:    shrl %edi
+; FALLBACK16-NEXT:    movb %ch, %cl
+; FALLBACK16-NEXT:    shrl %cl, %edi
+; FALLBACK16-NEXT:    orl %edx, %edi
+; FALLBACK16-NEXT:    movb %al, %cl
+; FALLBACK16-NEXT:    shll %cl, %ebx
+; FALLBACK16-NEXT:    shrl %esi
+; FALLBACK16-NEXT:    movb %ch, %cl
+; FALLBACK16-NEXT:    shrl %cl, %esi
+; FALLBACK16-NEXT:    orl %ebx, %esi
+; FALLBACK16-NEXT:    movl %eax, %ecx
+; FALLBACK16-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; FALLBACK16-NEXT:    shll %cl, %edx
+; FALLBACK16-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; FALLBACK16-NEXT:    movl %edx, (%eax)
+; FALLBACK16-NEXT:    movl %esi, 56(%eax)
+; FALLBACK16-NEXT:    movl %edi, 60(%eax)
+; FALLBACK16-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK16-NEXT:    movl %ecx, 48(%eax)
+; FALLBACK16-NEXT:    movl %ebp, 52(%eax)
+; FALLBACK16-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK16-NEXT:    movl %ecx, 40(%eax)
+; FALLBACK16-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK16-NEXT:    movl %ecx, 44(%eax)
+; FALLBACK16-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK16-NEXT:    movl %ecx, 32(%eax)
+; FALLBACK16-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK16-NEXT:    movl %ecx, 36(%eax)
+; FALLBACK16-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK16-NEXT:    movl %ecx, 24(%eax)
+; FALLBACK16-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK16-NEXT:    movl %ecx, 28(%eax)
+; FALLBACK16-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK16-NEXT:    movl %ecx, 16(%eax)
+; FALLBACK16-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK16-NEXT:    movl %ecx, 20(%eax)
+; FALLBACK16-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK16-NEXT:    movl %ecx, 8(%eax)
+; FALLBACK16-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK16-NEXT:    movl %ecx, 12(%eax)
+; FALLBACK16-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK16-NEXT:    movl %ecx, 4(%eax)
+; FALLBACK16-NEXT:    addl $204, %esp
+; FALLBACK16-NEXT:    popl %esi
+; FALLBACK16-NEXT:    popl %edi
+; FALLBACK16-NEXT:    popl %ebx
+; FALLBACK16-NEXT:    popl %ebp
+; FALLBACK16-NEXT:    retl
+;
+; FALLBACK17-LABEL: shl_64bytes:
+; FALLBACK17:       # %bb.0:
+; FALLBACK17-NEXT:    pushl %ebp
+; FALLBACK17-NEXT:    pushl %ebx
+; FALLBACK17-NEXT:    pushl %edi
+; FALLBACK17-NEXT:    pushl %esi
+; FALLBACK17-NEXT:    subl $188, %esp
+; FALLBACK17-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; FALLBACK17-NEXT:    movl (%ecx), %eax
+; FALLBACK17-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK17-NEXT:    movl 4(%ecx), %eax
+; FALLBACK17-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK17-NEXT:    movl 8(%ecx), %eax
+; FALLBACK17-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK17-NEXT:    movl 12(%ecx), %eax
+; FALLBACK17-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK17-NEXT:    movl 16(%ecx), %eax
+; FALLBACK17-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK17-NEXT:    movl 20(%ecx), %eax
+; FALLBACK17-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK17-NEXT:    movl 24(%ecx), %eax
+; FALLBACK17-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK17-NEXT:    movl 28(%ecx), %eax
+; FALLBACK17-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK17-NEXT:    movl 32(%ecx), %eax
+; FALLBACK17-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK17-NEXT:    movl 36(%ecx), %eax
+; FALLBACK17-NEXT:    movl %eax, (%esp) # 4-byte Spill
+; FALLBACK17-NEXT:    movl 40(%ecx), %ebp
+; FALLBACK17-NEXT:    movl 44(%ecx), %ebx
+; FALLBACK17-NEXT:    movl 48(%ecx), %edi
+; FALLBACK17-NEXT:    movl 52(%ecx), %esi
+; FALLBACK17-NEXT:    movl 56(%ecx), %edx
+; FALLBACK17-NEXT:    movl 60(%ecx), %eax
+; FALLBACK17-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; FALLBACK17-NEXT:    movl (%ecx), %ecx
+; FALLBACK17-NEXT:    xorps %xmm0, %xmm0
+; FALLBACK17-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
+; FALLBACK17-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK17-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; FALLBACK17-NEXT:    movl %esi, {{[0-9]+}}(%esp)
+; FALLBACK17-NEXT:    movl %edi, {{[0-9]+}}(%esp)
+; FALLBACK17-NEXT:    movl %ebx, {{[0-9]+}}(%esp)
+; FALLBACK17-NEXT:    movl %ebp, {{[0-9]+}}(%esp)
+; FALLBACK17-NEXT:    movl (%esp), %eax # 4-byte Reload
+; FALLBACK17-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK17-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK17-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK17-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK17-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK17-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK17-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK17-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
+; FALLBACK17-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
+; FALLBACK17-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
+; FALLBACK17-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK17-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK17-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK17-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK17-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK17-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK17-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK17-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK17-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK17-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK17-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK17-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK17-NEXT:    movl %ecx, %ebp
+; FALLBACK17-NEXT:    andl $60, %ebp
+; FALLBACK17-NEXT:    leal {{[0-9]+}}(%esp), %eax
+; FALLBACK17-NEXT:    subl %ebp, %eax
+; FALLBACK17-NEXT:    movl 8(%eax), %esi
+; FALLBACK17-NEXT:    movl 12(%eax), %edx
+; FALLBACK17-NEXT:    shll $3, %ecx
+; FALLBACK17-NEXT:    andl $24, %ecx
+; FALLBACK17-NEXT:    movl %edx, %edi
+; FALLBACK17-NEXT:    shldl %cl, %esi, %edi
+; FALLBACK17-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK17-NEXT:    movl 4(%eax), %edi
+; FALLBACK17-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK17-NEXT:    shldl %cl, %edi, %esi
+; FALLBACK17-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK17-NEXT:    movl 16(%eax), %edi
+; FALLBACK17-NEXT:    movl 20(%eax), %esi
+; FALLBACK17-NEXT:    movl %esi, %ebx
+; FALLBACK17-NEXT:    shldl %cl, %edi, %ebx
+; FALLBACK17-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK17-NEXT:    shldl %cl, %edx, %edi
+; FALLBACK17-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK17-NEXT:    movl 24(%eax), %edi
+; FALLBACK17-NEXT:    movl 28(%eax), %edx
+; FALLBACK17-NEXT:    movl %edx, %ebx
+; FALLBACK17-NEXT:    shldl %cl, %edi, %ebx
+; FALLBACK17-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK17-NEXT:    shldl %cl, %esi, %edi
+; FALLBACK17-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK17-NEXT:    movl 32(%eax), %edi
+; FALLBACK17-NEXT:    movl 36(%eax), %esi
+; FALLBACK17-NEXT:    movl %esi, %ebx
+; FALLBACK17-NEXT:    shldl %cl, %edi, %ebx
+; FALLBACK17-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK17-NEXT:    shldl %cl, %edx, %edi
+; FALLBACK17-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK17-NEXT:    movl 40(%eax), %edx
+; FALLBACK17-NEXT:    movl 44(%eax), %edi
+; FALLBACK17-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK17-NEXT:    shldl %cl, %edx, %edi
+; FALLBACK17-NEXT:    movl %edi, (%esp) # 4-byte Spill
+; FALLBACK17-NEXT:    shldl %cl, %esi, %edx
+; FALLBACK17-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK17-NEXT:    movl 56(%eax), %edx
+; FALLBACK17-NEXT:    movl 60(%eax), %edi
+; FALLBACK17-NEXT:    shldl %cl, %edx, %edi
+; FALLBACK17-NEXT:    movl (%eax), %ebx
+; FALLBACK17-NEXT:    movl 52(%eax), %esi
+; FALLBACK17-NEXT:    shldl %cl, %esi, %edx
+; FALLBACK17-NEXT:    negl %ebp
+; FALLBACK17-NEXT:    movl 160(%esp,%ebp), %eax
+; FALLBACK17-NEXT:    movl {{[0-9]+}}(%esp), %ebp
+; FALLBACK17-NEXT:    movl %edx, 56(%ebp)
+; FALLBACK17-NEXT:    movl %edi, 60(%ebp)
+; FALLBACK17-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; FALLBACK17-NEXT:    shldl %cl, %ebx, %edx
+; FALLBACK17-NEXT:    shll %cl, %ebx
+; FALLBACK17-NEXT:    shldl %cl, %eax, %esi
+; FALLBACK17-NEXT:    # kill: def $cl killed $cl killed $ecx
+; FALLBACK17-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; FALLBACK17-NEXT:    shldl %cl, %edi, %eax
+; FALLBACK17-NEXT:    movl %eax, 48(%ebp)
+; FALLBACK17-NEXT:    movl %esi, 52(%ebp)
+; FALLBACK17-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK17-NEXT:    movl %eax, 40(%ebp)
+; FALLBACK17-NEXT:    movl (%esp), %eax # 4-byte Reload
+; FALLBACK17-NEXT:    movl %eax, 44(%ebp)
+; FALLBACK17-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK17-NEXT:    movl %eax, 32(%ebp)
+; FALLBACK17-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK17-NEXT:    movl %eax, 36(%ebp)
+; FALLBACK17-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK17-NEXT:    movl %eax, 24(%ebp)
+; FALLBACK17-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK17-NEXT:    movl %eax, 28(%ebp)
+; FALLBACK17-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK17-NEXT:    movl %eax, 16(%ebp)
+; FALLBACK17-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK17-NEXT:    movl %eax, 20(%ebp)
+; FALLBACK17-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK17-NEXT:    movl %eax, 8(%ebp)
+; FALLBACK17-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK17-NEXT:    movl %eax, 12(%ebp)
+; FALLBACK17-NEXT:    movl %ebx, (%ebp)
+; FALLBACK17-NEXT:    movl %edx, 4(%ebp)
+; FALLBACK17-NEXT:    addl $188, %esp
+; FALLBACK17-NEXT:    popl %esi
+; FALLBACK17-NEXT:    popl %edi
+; FALLBACK17-NEXT:    popl %ebx
+; FALLBACK17-NEXT:    popl %ebp
+; FALLBACK17-NEXT:    retl
+;
+; FALLBACK18-LABEL: shl_64bytes:
+; FALLBACK18:       # %bb.0:
+; FALLBACK18-NEXT:    pushl %ebp
+; FALLBACK18-NEXT:    pushl %ebx
+; FALLBACK18-NEXT:    pushl %edi
+; FALLBACK18-NEXT:    pushl %esi
+; FALLBACK18-NEXT:    subl $204, %esp
+; FALLBACK18-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; FALLBACK18-NEXT:    movl (%eax), %ecx
+; FALLBACK18-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK18-NEXT:    movl 4(%eax), %ecx
+; FALLBACK18-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK18-NEXT:    movl 8(%eax), %ecx
+; FALLBACK18-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK18-NEXT:    movl 12(%eax), %ecx
+; FALLBACK18-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK18-NEXT:    movl 16(%eax), %ecx
+; FALLBACK18-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK18-NEXT:    movl 20(%eax), %ecx
+; FALLBACK18-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK18-NEXT:    movl 24(%eax), %ecx
+; FALLBACK18-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK18-NEXT:    movl 28(%eax), %ecx
+; FALLBACK18-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK18-NEXT:    movl 32(%eax), %ecx
+; FALLBACK18-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK18-NEXT:    movl 36(%eax), %ecx
+; FALLBACK18-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK18-NEXT:    movl 40(%eax), %ebx
+; FALLBACK18-NEXT:    movl 44(%eax), %edi
+; FALLBACK18-NEXT:    movl 48(%eax), %esi
+; FALLBACK18-NEXT:    movl 52(%eax), %edx
+; FALLBACK18-NEXT:    movl 56(%eax), %ecx
+; FALLBACK18-NEXT:    movl 60(%eax), %eax
+; FALLBACK18-NEXT:    movl {{[0-9]+}}(%esp), %ebp
+; FALLBACK18-NEXT:    movl (%ebp), %ebp
+; FALLBACK18-NEXT:    xorps %xmm0, %xmm0
+; FALLBACK18-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
+; FALLBACK18-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK18-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; FALLBACK18-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; FALLBACK18-NEXT:    movl %esi, {{[0-9]+}}(%esp)
+; FALLBACK18-NEXT:    movl %edi, {{[0-9]+}}(%esp)
+; FALLBACK18-NEXT:    movl %ebx, {{[0-9]+}}(%esp)
+; FALLBACK18-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK18-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK18-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK18-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK18-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK18-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK18-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK18-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK18-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
+; FALLBACK18-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
+; FALLBACK18-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
+; FALLBACK18-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK18-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK18-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK18-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK18-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK18-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK18-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK18-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK18-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK18-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK18-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK18-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK18-NEXT:    leal (,%ebp,8), %edx
+; FALLBACK18-NEXT:    andl $24, %edx
+; FALLBACK18-NEXT:    andl $60, %ebp
+; FALLBACK18-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK18-NEXT:    leal {{[0-9]+}}(%esp), %edi
+; FALLBACK18-NEXT:    subl %ebp, %edi
+; FALLBACK18-NEXT:    movl (%edi), %ecx
+; FALLBACK18-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK18-NEXT:    movl 4(%edi), %eax
+; FALLBACK18-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK18-NEXT:    movl %edx, %ebx
+; FALLBACK18-NEXT:    notb %bl
+; FALLBACK18-NEXT:    shrl %ecx
+; FALLBACK18-NEXT:    shrxl %ebx, %ecx, %esi
+; FALLBACK18-NEXT:    shlxl %edx, %eax, %ecx
+; FALLBACK18-NEXT:    orl %ecx, %esi
+; FALLBACK18-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK18-NEXT:    movl 8(%edi), %esi
+; FALLBACK18-NEXT:    movl %esi, %ecx
+; FALLBACK18-NEXT:    shrl %ecx
+; FALLBACK18-NEXT:    shrxl %ebx, %ecx, %eax
+; FALLBACK18-NEXT:    movl 12(%edi), %ecx
+; FALLBACK18-NEXT:    shlxl %edx, %ecx, %ebp
+; FALLBACK18-NEXT:    orl %ebp, %eax
+; FALLBACK18-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK18-NEXT:    shlxl %edx, %esi, %esi
+; FALLBACK18-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK18-NEXT:    shrl %eax
+; FALLBACK18-NEXT:    shrxl %ebx, %eax, %eax
+; FALLBACK18-NEXT:    orl %esi, %eax
+; FALLBACK18-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK18-NEXT:    movl 16(%edi), %eax
+; FALLBACK18-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK18-NEXT:    shrl %eax
+; FALLBACK18-NEXT:    shrxl %ebx, %eax, %eax
+; FALLBACK18-NEXT:    movl 20(%edi), %esi
+; FALLBACK18-NEXT:    shlxl %edx, %esi, %ebp
+; FALLBACK18-NEXT:    orl %ebp, %eax
+; FALLBACK18-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK18-NEXT:    shlxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; FALLBACK18-NEXT:    shrl %ecx
+; FALLBACK18-NEXT:    shrxl %ebx, %ecx, %ecx
+; FALLBACK18-NEXT:    orl %eax, %ecx
+; FALLBACK18-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK18-NEXT:    movl 24(%edi), %ecx
+; FALLBACK18-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK18-NEXT:    shrl %ecx
+; FALLBACK18-NEXT:    shrxl %ebx, %ecx, %eax
+; FALLBACK18-NEXT:    movl 28(%edi), %ecx
+; FALLBACK18-NEXT:    shlxl %edx, %ecx, %ebp
+; FALLBACK18-NEXT:    orl %ebp, %eax
+; FALLBACK18-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK18-NEXT:    shlxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; FALLBACK18-NEXT:    shrl %esi
+; FALLBACK18-NEXT:    shrxl %ebx, %esi, %esi
+; FALLBACK18-NEXT:    orl %eax, %esi
+; FALLBACK18-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK18-NEXT:    movl 32(%edi), %eax
+; FALLBACK18-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK18-NEXT:    shrl %eax
+; FALLBACK18-NEXT:    shrxl %ebx, %eax, %eax
+; FALLBACK18-NEXT:    movl 36(%edi), %esi
+; FALLBACK18-NEXT:    shlxl %edx, %esi, %ebp
+; FALLBACK18-NEXT:    orl %ebp, %eax
+; FALLBACK18-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK18-NEXT:    shlxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; FALLBACK18-NEXT:    shrl %ecx
+; FALLBACK18-NEXT:    shrxl %ebx, %ecx, %ecx
+; FALLBACK18-NEXT:    orl %eax, %ecx
+; FALLBACK18-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK18-NEXT:    movl 40(%edi), %ecx
+; FALLBACK18-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK18-NEXT:    shrl %ecx
+; FALLBACK18-NEXT:    shrxl %ebx, %ecx, %eax
+; FALLBACK18-NEXT:    movl 44(%edi), %ecx
+; FALLBACK18-NEXT:    shlxl %edx, %ecx, %ebp
+; FALLBACK18-NEXT:    orl %ebp, %eax
+; FALLBACK18-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK18-NEXT:    shlxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; FALLBACK18-NEXT:    shrl %esi
+; FALLBACK18-NEXT:    shrxl %ebx, %esi, %esi
+; FALLBACK18-NEXT:    orl %eax, %esi
+; FALLBACK18-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK18-NEXT:    movl 48(%edi), %esi
+; FALLBACK18-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK18-NEXT:    shrl %esi
+; FALLBACK18-NEXT:    shrxl %ebx, %esi, %eax
+; FALLBACK18-NEXT:    movl 52(%edi), %esi
+; FALLBACK18-NEXT:    shlxl %edx, %esi, %ebp
+; FALLBACK18-NEXT:    orl %ebp, %eax
+; FALLBACK18-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK18-NEXT:    shlxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; FALLBACK18-NEXT:    shrl %ecx
+; FALLBACK18-NEXT:    shrxl %ebx, %ecx, %ebp
+; FALLBACK18-NEXT:    orl %eax, %ebp
+; FALLBACK18-NEXT:    shlxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; FALLBACK18-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK18-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK18-NEXT:    negl %eax
+; FALLBACK18-NEXT:    shlxl %edx, 188(%esp,%eax), %ecx
+; FALLBACK18-NEXT:    movl 56(%edi), %eax
+; FALLBACK18-NEXT:    shlxl %edx, %eax, %edx
+; FALLBACK18-NEXT:    shrl %esi
+; FALLBACK18-NEXT:    shrxl %ebx, %esi, %esi
+; FALLBACK18-NEXT:    orl %edx, %esi
+; FALLBACK18-NEXT:    shrl %eax
+; FALLBACK18-NEXT:    shrxl %ebx, %eax, %eax
+; FALLBACK18-NEXT:    orl %eax, %ecx
+; FALLBACK18-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; FALLBACK18-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; FALLBACK18-NEXT:    movl %edx, (%eax)
+; FALLBACK18-NEXT:    movl %esi, 56(%eax)
+; FALLBACK18-NEXT:    movl %ecx, 60(%eax)
+; FALLBACK18-NEXT:    movl %ebp, 48(%eax)
+; FALLBACK18-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK18-NEXT:    movl %ecx, 52(%eax)
+; FALLBACK18-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK18-NEXT:    movl %ecx, 40(%eax)
+; FALLBACK18-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK18-NEXT:    movl %ecx, 44(%eax)
+; FALLBACK18-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK18-NEXT:    movl %ecx, 32(%eax)
+; FALLBACK18-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK18-NEXT:    movl %ecx, 36(%eax)
+; FALLBACK18-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK18-NEXT:    movl %ecx, 24(%eax)
+; FALLBACK18-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK18-NEXT:    movl %ecx, 28(%eax)
+; FALLBACK18-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK18-NEXT:    movl %ecx, 16(%eax)
+; FALLBACK18-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK18-NEXT:    movl %ecx, 20(%eax)
+; FALLBACK18-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK18-NEXT:    movl %ecx, 8(%eax)
+; FALLBACK18-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK18-NEXT:    movl %ecx, 12(%eax)
+; FALLBACK18-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK18-NEXT:    movl %ecx, 4(%eax)
+; FALLBACK18-NEXT:    addl $204, %esp
+; FALLBACK18-NEXT:    popl %esi
+; FALLBACK18-NEXT:    popl %edi
+; FALLBACK18-NEXT:    popl %ebx
+; FALLBACK18-NEXT:    popl %ebp
+; FALLBACK18-NEXT:    retl
+;
+; FALLBACK19-LABEL: shl_64bytes:
+; FALLBACK19:       # %bb.0:
+; FALLBACK19-NEXT:    pushl %ebp
+; FALLBACK19-NEXT:    pushl %ebx
+; FALLBACK19-NEXT:    pushl %edi
+; FALLBACK19-NEXT:    pushl %esi
+; FALLBACK19-NEXT:    subl $204, %esp
+; FALLBACK19-NEXT:    movl {{[0-9]+}}(%esp), %ebp
+; FALLBACK19-NEXT:    movl (%ebp), %eax
+; FALLBACK19-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK19-NEXT:    movl 4(%ebp), %eax
+; FALLBACK19-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK19-NEXT:    movl 8(%ebp), %eax
+; FALLBACK19-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK19-NEXT:    movl 12(%ebp), %eax
+; FALLBACK19-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK19-NEXT:    movl 16(%ebp), %eax
+; FALLBACK19-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK19-NEXT:    movl 20(%ebp), %eax
+; FALLBACK19-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK19-NEXT:    movl 24(%ebp), %eax
+; FALLBACK19-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK19-NEXT:    movl 28(%ebp), %eax
+; FALLBACK19-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK19-NEXT:    movl 32(%ebp), %eax
+; FALLBACK19-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK19-NEXT:    movl 36(%ebp), %eax
+; FALLBACK19-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK19-NEXT:    movl 40(%ebp), %ebx
+; FALLBACK19-NEXT:    movl 44(%ebp), %edi
+; FALLBACK19-NEXT:    movl 48(%ebp), %esi
+; FALLBACK19-NEXT:    movl 52(%ebp), %edx
+; FALLBACK19-NEXT:    movl 56(%ebp), %ecx
+; FALLBACK19-NEXT:    movl 60(%ebp), %eax
+; FALLBACK19-NEXT:    movl {{[0-9]+}}(%esp), %ebp
+; FALLBACK19-NEXT:    movl (%ebp), %ebp
+; FALLBACK19-NEXT:    xorps %xmm0, %xmm0
+; FALLBACK19-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
+; FALLBACK19-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK19-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; FALLBACK19-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; FALLBACK19-NEXT:    movl %esi, {{[0-9]+}}(%esp)
+; FALLBACK19-NEXT:    movl %edi, {{[0-9]+}}(%esp)
+; FALLBACK19-NEXT:    movl %ebx, {{[0-9]+}}(%esp)
+; FALLBACK19-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK19-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK19-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK19-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK19-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK19-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK19-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK19-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK19-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
+; FALLBACK19-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
+; FALLBACK19-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
+; FALLBACK19-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK19-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK19-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK19-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK19-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK19-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK19-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK19-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK19-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK19-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK19-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK19-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK19-NEXT:    leal (,%ebp,8), %ecx
+; FALLBACK19-NEXT:    andl $24, %ecx
+; FALLBACK19-NEXT:    andl $60, %ebp
+; FALLBACK19-NEXT:    leal {{[0-9]+}}(%esp), %eax
+; FALLBACK19-NEXT:    subl %ebp, %eax
+; FALLBACK19-NEXT:    movl 4(%eax), %esi
+; FALLBACK19-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK19-NEXT:    movl 8(%eax), %edi
+; FALLBACK19-NEXT:    movl 12(%eax), %edx
+; FALLBACK19-NEXT:    movl %edx, %ebx
+; FALLBACK19-NEXT:    shldl %cl, %edi, %ebx
+; FALLBACK19-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK19-NEXT:    shldl %cl, %esi, %edi
+; FALLBACK19-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK19-NEXT:    movl 16(%eax), %edi
+; FALLBACK19-NEXT:    movl 20(%eax), %esi
+; FALLBACK19-NEXT:    movl %esi, %ebx
+; FALLBACK19-NEXT:    shldl %cl, %edi, %ebx
+; FALLBACK19-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK19-NEXT:    shldl %cl, %edx, %edi
+; FALLBACK19-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK19-NEXT:    movl 24(%eax), %edi
+; FALLBACK19-NEXT:    movl 28(%eax), %edx
+; FALLBACK19-NEXT:    movl %edx, %ebx
+; FALLBACK19-NEXT:    shldl %cl, %edi, %ebx
+; FALLBACK19-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK19-NEXT:    shldl %cl, %esi, %edi
+; FALLBACK19-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK19-NEXT:    movl 32(%eax), %edi
+; FALLBACK19-NEXT:    movl 36(%eax), %esi
+; FALLBACK19-NEXT:    movl %esi, %ebx
+; FALLBACK19-NEXT:    shldl %cl, %edi, %ebx
+; FALLBACK19-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK19-NEXT:    shldl %cl, %edx, %edi
+; FALLBACK19-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK19-NEXT:    movl 40(%eax), %ebx
+; FALLBACK19-NEXT:    movl 44(%eax), %edx
+; FALLBACK19-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK19-NEXT:    shldl %cl, %ebx, %edx
+; FALLBACK19-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK19-NEXT:    shldl %cl, %esi, %ebx
+; FALLBACK19-NEXT:    movl 56(%eax), %edx
+; FALLBACK19-NEXT:    movl 60(%eax), %edi
+; FALLBACK19-NEXT:    shldl %cl, %edx, %edi
+; FALLBACK19-NEXT:    movl (%eax), %esi
+; FALLBACK19-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK19-NEXT:    movl 52(%eax), %esi
+; FALLBACK19-NEXT:    shldl %cl, %esi, %edx
+; FALLBACK19-NEXT:    negl %ebp
+; FALLBACK19-NEXT:    movl 176(%esp,%ebp), %ebp
+; FALLBACK19-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; FALLBACK19-NEXT:    movl %edx, 56(%eax)
+; FALLBACK19-NEXT:    movl %edi, 60(%eax)
+; FALLBACK19-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; FALLBACK19-NEXT:    shlxl %ecx, %edx, %edi
+; FALLBACK19-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK19-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; FALLBACK19-NEXT:    shldl %cl, %edx, %edi
+; FALLBACK19-NEXT:    shldl %cl, %ebp, %esi
+; FALLBACK19-NEXT:    # kill: def $cl killed $cl killed $ecx
+; FALLBACK19-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; FALLBACK19-NEXT:    shldl %cl, %edx, %ebp
+; FALLBACK19-NEXT:    movl %ebp, 48(%eax)
+; FALLBACK19-NEXT:    movl %esi, 52(%eax)
+; FALLBACK19-NEXT:    movl %ebx, 40(%eax)
+; FALLBACK19-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK19-NEXT:    movl %ecx, 44(%eax)
+; FALLBACK19-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK19-NEXT:    movl %ecx, 32(%eax)
+; FALLBACK19-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK19-NEXT:    movl %ecx, 36(%eax)
+; FALLBACK19-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK19-NEXT:    movl %ecx, 24(%eax)
+; FALLBACK19-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK19-NEXT:    movl %ecx, 28(%eax)
+; FALLBACK19-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK19-NEXT:    movl %ecx, 16(%eax)
+; FALLBACK19-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK19-NEXT:    movl %ecx, 20(%eax)
+; FALLBACK19-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK19-NEXT:    movl %ecx, 8(%eax)
+; FALLBACK19-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK19-NEXT:    movl %ecx, 12(%eax)
+; FALLBACK19-NEXT:    movl %edi, 4(%eax)
+; FALLBACK19-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK19-NEXT:    movl %ecx, (%eax)
+; FALLBACK19-NEXT:    addl $204, %esp
+; FALLBACK19-NEXT:    popl %esi
+; FALLBACK19-NEXT:    popl %edi
+; FALLBACK19-NEXT:    popl %ebx
+; FALLBACK19-NEXT:    popl %ebp
+; FALLBACK19-NEXT:    retl
+;
+; FALLBACK20-LABEL: shl_64bytes:
+; FALLBACK20:       # %bb.0:
+; FALLBACK20-NEXT:    pushl %ebp
+; FALLBACK20-NEXT:    pushl %ebx
+; FALLBACK20-NEXT:    pushl %edi
+; FALLBACK20-NEXT:    pushl %esi
+; FALLBACK20-NEXT:    subl $204, %esp
+; FALLBACK20-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; FALLBACK20-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; FALLBACK20-NEXT:    movups (%ecx), %xmm0
+; FALLBACK20-NEXT:    movups 16(%ecx), %xmm1
+; FALLBACK20-NEXT:    movups 32(%ecx), %xmm2
+; FALLBACK20-NEXT:    movups 48(%ecx), %xmm3
+; FALLBACK20-NEXT:    movl (%eax), %eax
+; FALLBACK20-NEXT:    xorps %xmm4, %xmm4
+; FALLBACK20-NEXT:    movaps %xmm4, {{[0-9]+}}(%esp)
+; FALLBACK20-NEXT:    movaps %xmm4, {{[0-9]+}}(%esp)
+; FALLBACK20-NEXT:    movaps %xmm4, {{[0-9]+}}(%esp)
+; FALLBACK20-NEXT:    movaps %xmm4, {{[0-9]+}}(%esp)
+; FALLBACK20-NEXT:    movaps %xmm3, {{[0-9]+}}(%esp)
+; FALLBACK20-NEXT:    movaps %xmm2, {{[0-9]+}}(%esp)
+; FALLBACK20-NEXT:    movaps %xmm1, {{[0-9]+}}(%esp)
+; FALLBACK20-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
+; FALLBACK20-NEXT:    movl %eax, %edx
+; FALLBACK20-NEXT:    andl $60, %edx
+; FALLBACK20-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK20-NEXT:    leal {{[0-9]+}}(%esp), %ecx
+; FALLBACK20-NEXT:    subl %edx, %ecx
+; FALLBACK20-NEXT:    movl (%ecx), %edi
+; FALLBACK20-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK20-NEXT:    movl 4(%ecx), %edx
+; FALLBACK20-NEXT:    movl %ecx, %ebp
+; FALLBACK20-NEXT:    shll $3, %eax
+; FALLBACK20-NEXT:    andl $24, %eax
+; FALLBACK20-NEXT:    movl %edx, %esi
+; FALLBACK20-NEXT:    movl %eax, %ecx
+; FALLBACK20-NEXT:    shll %cl, %esi
+; FALLBACK20-NEXT:    shrl %edi
+; FALLBACK20-NEXT:    movb %al, %ch
+; FALLBACK20-NEXT:    notb %ch
+; FALLBACK20-NEXT:    movb %ch, %cl
+; FALLBACK20-NEXT:    shrl %cl, %edi
+; FALLBACK20-NEXT:    orl %esi, %edi
+; FALLBACK20-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK20-NEXT:    movl 12(%ebp), %ebx
+; FALLBACK20-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK20-NEXT:    movb %al, %cl
+; FALLBACK20-NEXT:    shll %cl, %ebx
+; FALLBACK20-NEXT:    movl 8(%ebp), %esi
+; FALLBACK20-NEXT:    movl %ebp, %edi
+; FALLBACK20-NEXT:    movl %esi, %ebp
+; FALLBACK20-NEXT:    shrl %ebp
+; FALLBACK20-NEXT:    movb %ch, %cl
+; FALLBACK20-NEXT:    shrl %cl, %ebp
+; FALLBACK20-NEXT:    orl %ebx, %ebp
+; FALLBACK20-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK20-NEXT:    movb %al, %cl
+; FALLBACK20-NEXT:    shll %cl, %esi
+; FALLBACK20-NEXT:    shrl %edx
+; FALLBACK20-NEXT:    movb %ch, %cl
+; FALLBACK20-NEXT:    shrl %cl, %edx
+; FALLBACK20-NEXT:    orl %esi, %edx
+; FALLBACK20-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK20-NEXT:    movl %edi, %ebp
+; FALLBACK20-NEXT:    movl 20(%edi), %ebx
+; FALLBACK20-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK20-NEXT:    movb %al, %cl
+; FALLBACK20-NEXT:    shll %cl, %ebx
+; FALLBACK20-NEXT:    movl 16(%edi), %esi
+; FALLBACK20-NEXT:    movl %esi, %edx
+; FALLBACK20-NEXT:    shrl %edx
+; FALLBACK20-NEXT:    movb %ch, %cl
+; FALLBACK20-NEXT:    shrl %cl, %edx
+; FALLBACK20-NEXT:    orl %ebx, %edx
+; FALLBACK20-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK20-NEXT:    movb %al, %cl
+; FALLBACK20-NEXT:    shll %cl, %esi
+; FALLBACK20-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; FALLBACK20-NEXT:    shrl %edi
+; FALLBACK20-NEXT:    movb %ch, %cl
+; FALLBACK20-NEXT:    shrl %cl, %edi
+; FALLBACK20-NEXT:    orl %esi, %edi
+; FALLBACK20-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK20-NEXT:    movl %ebp, %edx
+; FALLBACK20-NEXT:    movl 28(%ebp), %ebx
+; FALLBACK20-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK20-NEXT:    movb %al, %cl
+; FALLBACK20-NEXT:    shll %cl, %ebx
+; FALLBACK20-NEXT:    movl 24(%ebp), %esi
+; FALLBACK20-NEXT:    movl %esi, %edi
+; FALLBACK20-NEXT:    shrl %edi
+; FALLBACK20-NEXT:    movb %ch, %cl
+; FALLBACK20-NEXT:    shrl %cl, %edi
+; FALLBACK20-NEXT:    orl %ebx, %edi
+; FALLBACK20-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK20-NEXT:    movb %al, %cl
+; FALLBACK20-NEXT:    shll %cl, %esi
+; FALLBACK20-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
+; FALLBACK20-NEXT:    shrl %ebp
+; FALLBACK20-NEXT:    movb %ch, %cl
+; FALLBACK20-NEXT:    shrl %cl, %ebp
+; FALLBACK20-NEXT:    orl %esi, %ebp
+; FALLBACK20-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK20-NEXT:    movl 36(%edx), %ebx
+; FALLBACK20-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK20-NEXT:    movb %al, %cl
+; FALLBACK20-NEXT:    shll %cl, %ebx
+; FALLBACK20-NEXT:    movl 32(%edx), %esi
+; FALLBACK20-NEXT:    movl %edx, %ebp
+; FALLBACK20-NEXT:    movl %esi, %edi
+; FALLBACK20-NEXT:    shrl %edi
+; FALLBACK20-NEXT:    movb %ch, %cl
+; FALLBACK20-NEXT:    shrl %cl, %edi
+; FALLBACK20-NEXT:    orl %ebx, %edi
+; FALLBACK20-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK20-NEXT:    movb %al, %cl
+; FALLBACK20-NEXT:    shll %cl, %esi
+; FALLBACK20-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; FALLBACK20-NEXT:    shrl %edx
+; FALLBACK20-NEXT:    movb %ch, %cl
+; FALLBACK20-NEXT:    shrl %cl, %edx
+; FALLBACK20-NEXT:    orl %esi, %edx
+; FALLBACK20-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK20-NEXT:    movl 44(%ebp), %ebx
+; FALLBACK20-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK20-NEXT:    movb %al, %cl
+; FALLBACK20-NEXT:    shll %cl, %ebx
+; FALLBACK20-NEXT:    movl 40(%ebp), %esi
+; FALLBACK20-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK20-NEXT:    movl %esi, %edx
+; FALLBACK20-NEXT:    shrl %edx
+; FALLBACK20-NEXT:    movb %ch, %cl
+; FALLBACK20-NEXT:    shrl %cl, %edx
+; FALLBACK20-NEXT:    orl %ebx, %edx
+; FALLBACK20-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK20-NEXT:    movb %al, %cl
+; FALLBACK20-NEXT:    shll %cl, %esi
+; FALLBACK20-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; FALLBACK20-NEXT:    shrl %edx
+; FALLBACK20-NEXT:    movb %ch, %cl
+; FALLBACK20-NEXT:    shrl %cl, %edx
+; FALLBACK20-NEXT:    orl %esi, %edx
+; FALLBACK20-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK20-NEXT:    movl 52(%ebp), %esi
+; FALLBACK20-NEXT:    movl %esi, %edi
+; FALLBACK20-NEXT:    movb %al, %cl
+; FALLBACK20-NEXT:    shll %cl, %edi
+; FALLBACK20-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; FALLBACK20-NEXT:    negl %edx
+; FALLBACK20-NEXT:    movl 176(%esp,%edx), %ebx
+; FALLBACK20-NEXT:    movl %ebx, %ebp
+; FALLBACK20-NEXT:    shrl %ebp
+; FALLBACK20-NEXT:    movb %ch, %cl
+; FALLBACK20-NEXT:    shrl %cl, %ebp
+; FALLBACK20-NEXT:    orl %edi, %ebp
+; FALLBACK20-NEXT:    movb %al, %cl
+; FALLBACK20-NEXT:    shll %cl, %ebx
+; FALLBACK20-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; FALLBACK20-NEXT:    shrl %edx
+; FALLBACK20-NEXT:    movb %ch, %cl
+; FALLBACK20-NEXT:    shrl %cl, %edx
+; FALLBACK20-NEXT:    orl %ebx, %edx
+; FALLBACK20-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK20-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; FALLBACK20-NEXT:    movl 60(%edi), %edx
+; FALLBACK20-NEXT:    movb %al, %cl
+; FALLBACK20-NEXT:    shll %cl, %edx
+; FALLBACK20-NEXT:    movl 56(%edi), %ebx
+; FALLBACK20-NEXT:    movl %ebx, %edi
+; FALLBACK20-NEXT:    shrl %edi
+; FALLBACK20-NEXT:    movb %ch, %cl
+; FALLBACK20-NEXT:    shrl %cl, %edi
+; FALLBACK20-NEXT:    orl %edx, %edi
+; FALLBACK20-NEXT:    movb %al, %cl
+; FALLBACK20-NEXT:    shll %cl, %ebx
+; FALLBACK20-NEXT:    shrl %esi
+; FALLBACK20-NEXT:    movb %ch, %cl
+; FALLBACK20-NEXT:    shrl %cl, %esi
+; FALLBACK20-NEXT:    orl %ebx, %esi
+; FALLBACK20-NEXT:    movl %eax, %ecx
+; FALLBACK20-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; FALLBACK20-NEXT:    shll %cl, %edx
+; FALLBACK20-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; FALLBACK20-NEXT:    movl %edx, (%eax)
+; FALLBACK20-NEXT:    movl %esi, 56(%eax)
+; FALLBACK20-NEXT:    movl %edi, 60(%eax)
+; FALLBACK20-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK20-NEXT:    movl %ecx, 48(%eax)
+; FALLBACK20-NEXT:    movl %ebp, 52(%eax)
+; FALLBACK20-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK20-NEXT:    movl %ecx, 40(%eax)
+; FALLBACK20-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK20-NEXT:    movl %ecx, 44(%eax)
+; FALLBACK20-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK20-NEXT:    movl %ecx, 32(%eax)
+; FALLBACK20-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK20-NEXT:    movl %ecx, 36(%eax)
+; FALLBACK20-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK20-NEXT:    movl %ecx, 24(%eax)
+; FALLBACK20-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK20-NEXT:    movl %ecx, 28(%eax)
+; FALLBACK20-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK20-NEXT:    movl %ecx, 16(%eax)
+; FALLBACK20-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK20-NEXT:    movl %ecx, 20(%eax)
+; FALLBACK20-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK20-NEXT:    movl %ecx, 8(%eax)
+; FALLBACK20-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK20-NEXT:    movl %ecx, 12(%eax)
+; FALLBACK20-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK20-NEXT:    movl %ecx, 4(%eax)
+; FALLBACK20-NEXT:    addl $204, %esp
+; FALLBACK20-NEXT:    popl %esi
+; FALLBACK20-NEXT:    popl %edi
+; FALLBACK20-NEXT:    popl %ebx
+; FALLBACK20-NEXT:    popl %ebp
+; FALLBACK20-NEXT:    retl
+;
+; FALLBACK21-LABEL: shl_64bytes:
+; FALLBACK21:       # %bb.0:
+; FALLBACK21-NEXT:    pushl %ebp
+; FALLBACK21-NEXT:    pushl %ebx
+; FALLBACK21-NEXT:    pushl %edi
+; FALLBACK21-NEXT:    pushl %esi
+; FALLBACK21-NEXT:    subl $188, %esp
+; FALLBACK21-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; FALLBACK21-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; FALLBACK21-NEXT:    movups (%ecx), %xmm0
+; FALLBACK21-NEXT:    movups 16(%ecx), %xmm1
+; FALLBACK21-NEXT:    movups 32(%ecx), %xmm2
+; FALLBACK21-NEXT:    movups 48(%ecx), %xmm3
+; FALLBACK21-NEXT:    movl (%eax), %ecx
+; FALLBACK21-NEXT:    xorps %xmm4, %xmm4
+; FALLBACK21-NEXT:    movaps %xmm4, {{[0-9]+}}(%esp)
+; FALLBACK21-NEXT:    movaps %xmm4, {{[0-9]+}}(%esp)
+; FALLBACK21-NEXT:    movaps %xmm4, {{[0-9]+}}(%esp)
+; FALLBACK21-NEXT:    movaps %xmm4, {{[0-9]+}}(%esp)
+; FALLBACK21-NEXT:    movaps %xmm3, {{[0-9]+}}(%esp)
+; FALLBACK21-NEXT:    movaps %xmm2, {{[0-9]+}}(%esp)
+; FALLBACK21-NEXT:    movaps %xmm1, {{[0-9]+}}(%esp)
+; FALLBACK21-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
+; FALLBACK21-NEXT:    movl %ecx, %ebp
+; FALLBACK21-NEXT:    andl $60, %ebp
+; FALLBACK21-NEXT:    leal {{[0-9]+}}(%esp), %eax
+; FALLBACK21-NEXT:    subl %ebp, %eax
+; FALLBACK21-NEXT:    movl 8(%eax), %esi
+; FALLBACK21-NEXT:    movl 12(%eax), %edx
+; FALLBACK21-NEXT:    shll $3, %ecx
+; FALLBACK21-NEXT:    andl $24, %ecx
+; FALLBACK21-NEXT:    movl %edx, %edi
+; FALLBACK21-NEXT:    shldl %cl, %esi, %edi
+; FALLBACK21-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK21-NEXT:    movl 4(%eax), %edi
+; FALLBACK21-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK21-NEXT:    shldl %cl, %edi, %esi
+; FALLBACK21-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK21-NEXT:    movl 16(%eax), %edi
+; FALLBACK21-NEXT:    movl 20(%eax), %esi
+; FALLBACK21-NEXT:    movl %esi, %ebx
+; FALLBACK21-NEXT:    shldl %cl, %edi, %ebx
+; FALLBACK21-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK21-NEXT:    shldl %cl, %edx, %edi
+; FALLBACK21-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK21-NEXT:    movl 24(%eax), %edi
+; FALLBACK21-NEXT:    movl 28(%eax), %edx
+; FALLBACK21-NEXT:    movl %edx, %ebx
+; FALLBACK21-NEXT:    shldl %cl, %edi, %ebx
+; FALLBACK21-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK21-NEXT:    shldl %cl, %esi, %edi
+; FALLBACK21-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK21-NEXT:    movl 32(%eax), %edi
+; FALLBACK21-NEXT:    movl 36(%eax), %esi
+; FALLBACK21-NEXT:    movl %esi, %ebx
+; FALLBACK21-NEXT:    shldl %cl, %edi, %ebx
+; FALLBACK21-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK21-NEXT:    shldl %cl, %edx, %edi
+; FALLBACK21-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK21-NEXT:    movl 40(%eax), %edx
+; FALLBACK21-NEXT:    movl 44(%eax), %edi
+; FALLBACK21-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK21-NEXT:    shldl %cl, %edx, %edi
+; FALLBACK21-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK21-NEXT:    shldl %cl, %esi, %edx
+; FALLBACK21-NEXT:    movl %edx, (%esp) # 4-byte Spill
+; FALLBACK21-NEXT:    movl 56(%eax), %edx
+; FALLBACK21-NEXT:    movl 60(%eax), %edi
+; FALLBACK21-NEXT:    shldl %cl, %edx, %edi
+; FALLBACK21-NEXT:    movl (%eax), %ebx
+; FALLBACK21-NEXT:    movl 52(%eax), %esi
+; FALLBACK21-NEXT:    shldl %cl, %esi, %edx
+; FALLBACK21-NEXT:    negl %ebp
+; FALLBACK21-NEXT:    movl 160(%esp,%ebp), %eax
+; FALLBACK21-NEXT:    movl {{[0-9]+}}(%esp), %ebp
+; FALLBACK21-NEXT:    movl %edx, 56(%ebp)
+; FALLBACK21-NEXT:    movl %edi, 60(%ebp)
+; FALLBACK21-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; FALLBACK21-NEXT:    shldl %cl, %ebx, %edx
+; FALLBACK21-NEXT:    shll %cl, %ebx
+; FALLBACK21-NEXT:    shldl %cl, %eax, %esi
+; FALLBACK21-NEXT:    # kill: def $cl killed $cl killed $ecx
+; FALLBACK21-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; FALLBACK21-NEXT:    shldl %cl, %edi, %eax
+; FALLBACK21-NEXT:    movl %eax, 48(%ebp)
+; FALLBACK21-NEXT:    movl %esi, 52(%ebp)
+; FALLBACK21-NEXT:    movl (%esp), %eax # 4-byte Reload
+; FALLBACK21-NEXT:    movl %eax, 40(%ebp)
+; FALLBACK21-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK21-NEXT:    movl %eax, 44(%ebp)
+; FALLBACK21-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK21-NEXT:    movl %eax, 32(%ebp)
+; FALLBACK21-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK21-NEXT:    movl %eax, 36(%ebp)
+; FALLBACK21-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK21-NEXT:    movl %eax, 24(%ebp)
+; FALLBACK21-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK21-NEXT:    movl %eax, 28(%ebp)
+; FALLBACK21-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK21-NEXT:    movl %eax, 16(%ebp)
+; FALLBACK21-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK21-NEXT:    movl %eax, 20(%ebp)
+; FALLBACK21-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK21-NEXT:    movl %eax, 8(%ebp)
+; FALLBACK21-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK21-NEXT:    movl %eax, 12(%ebp)
+; FALLBACK21-NEXT:    movl %ebx, (%ebp)
+; FALLBACK21-NEXT:    movl %edx, 4(%ebp)
+; FALLBACK21-NEXT:    addl $188, %esp
+; FALLBACK21-NEXT:    popl %esi
+; FALLBACK21-NEXT:    popl %edi
+; FALLBACK21-NEXT:    popl %ebx
+; FALLBACK21-NEXT:    popl %ebp
+; FALLBACK21-NEXT:    retl
+;
+; FALLBACK22-LABEL: shl_64bytes:
+; FALLBACK22:       # %bb.0:
+; FALLBACK22-NEXT:    pushl %ebp
+; FALLBACK22-NEXT:    pushl %ebx
+; FALLBACK22-NEXT:    pushl %edi
+; FALLBACK22-NEXT:    pushl %esi
+; FALLBACK22-NEXT:    subl $204, %esp
+; FALLBACK22-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; FALLBACK22-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; FALLBACK22-NEXT:    movups (%ecx), %xmm0
+; FALLBACK22-NEXT:    movups 16(%ecx), %xmm1
+; FALLBACK22-NEXT:    movups 32(%ecx), %xmm2
+; FALLBACK22-NEXT:    movups 48(%ecx), %xmm3
+; FALLBACK22-NEXT:    movl (%eax), %eax
+; FALLBACK22-NEXT:    xorps %xmm4, %xmm4
+; FALLBACK22-NEXT:    movaps %xmm4, {{[0-9]+}}(%esp)
+; FALLBACK22-NEXT:    movaps %xmm4, {{[0-9]+}}(%esp)
+; FALLBACK22-NEXT:    movaps %xmm4, {{[0-9]+}}(%esp)
+; FALLBACK22-NEXT:    movaps %xmm4, {{[0-9]+}}(%esp)
+; FALLBACK22-NEXT:    movaps %xmm3, {{[0-9]+}}(%esp)
+; FALLBACK22-NEXT:    movaps %xmm2, {{[0-9]+}}(%esp)
+; FALLBACK22-NEXT:    movaps %xmm1, {{[0-9]+}}(%esp)
+; FALLBACK22-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
+; FALLBACK22-NEXT:    leal (,%eax,8), %edx
+; FALLBACK22-NEXT:    andl $24, %edx
+; FALLBACK22-NEXT:    andl $60, %eax
+; FALLBACK22-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK22-NEXT:    leal {{[0-9]+}}(%esp), %edi
+; FALLBACK22-NEXT:    subl %eax, %edi
+; FALLBACK22-NEXT:    movl (%edi), %ecx
+; FALLBACK22-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK22-NEXT:    movl 4(%edi), %eax
+; FALLBACK22-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK22-NEXT:    movl %edx, %ebx
+; FALLBACK22-NEXT:    notb %bl
+; FALLBACK22-NEXT:    shrl %ecx
+; FALLBACK22-NEXT:    shrxl %ebx, %ecx, %esi
+; FALLBACK22-NEXT:    shlxl %edx, %eax, %ecx
+; FALLBACK22-NEXT:    orl %ecx, %esi
+; FALLBACK22-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK22-NEXT:    movl 8(%edi), %esi
+; FALLBACK22-NEXT:    movl %esi, %ecx
+; FALLBACK22-NEXT:    shrl %ecx
+; FALLBACK22-NEXT:    shrxl %ebx, %ecx, %eax
+; FALLBACK22-NEXT:    movl 12(%edi), %ecx
+; FALLBACK22-NEXT:    shlxl %edx, %ecx, %ebp
+; FALLBACK22-NEXT:    orl %ebp, %eax
+; FALLBACK22-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK22-NEXT:    shlxl %edx, %esi, %esi
+; FALLBACK22-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK22-NEXT:    shrl %eax
+; FALLBACK22-NEXT:    shrxl %ebx, %eax, %eax
+; FALLBACK22-NEXT:    orl %esi, %eax
+; FALLBACK22-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK22-NEXT:    movl 16(%edi), %eax
+; FALLBACK22-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK22-NEXT:    shrl %eax
+; FALLBACK22-NEXT:    shrxl %ebx, %eax, %eax
+; FALLBACK22-NEXT:    movl 20(%edi), %esi
+; FALLBACK22-NEXT:    shlxl %edx, %esi, %ebp
+; FALLBACK22-NEXT:    orl %ebp, %eax
+; FALLBACK22-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK22-NEXT:    shlxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; FALLBACK22-NEXT:    shrl %ecx
+; FALLBACK22-NEXT:    shrxl %ebx, %ecx, %ecx
+; FALLBACK22-NEXT:    orl %eax, %ecx
+; FALLBACK22-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK22-NEXT:    movl 24(%edi), %ecx
+; FALLBACK22-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK22-NEXT:    shrl %ecx
+; FALLBACK22-NEXT:    shrxl %ebx, %ecx, %eax
+; FALLBACK22-NEXT:    movl 28(%edi), %ecx
+; FALLBACK22-NEXT:    shlxl %edx, %ecx, %ebp
+; FALLBACK22-NEXT:    orl %ebp, %eax
+; FALLBACK22-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK22-NEXT:    shlxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; FALLBACK22-NEXT:    shrl %esi
+; FALLBACK22-NEXT:    shrxl %ebx, %esi, %esi
+; FALLBACK22-NEXT:    orl %eax, %esi
+; FALLBACK22-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK22-NEXT:    movl 32(%edi), %eax
+; FALLBACK22-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK22-NEXT:    shrl %eax
+; FALLBACK22-NEXT:    shrxl %ebx, %eax, %eax
+; FALLBACK22-NEXT:    movl 36(%edi), %esi
+; FALLBACK22-NEXT:    shlxl %edx, %esi, %ebp
+; FALLBACK22-NEXT:    orl %ebp, %eax
+; FALLBACK22-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK22-NEXT:    shlxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; FALLBACK22-NEXT:    shrl %ecx
+; FALLBACK22-NEXT:    shrxl %ebx, %ecx, %ecx
+; FALLBACK22-NEXT:    orl %eax, %ecx
+; FALLBACK22-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK22-NEXT:    movl 40(%edi), %ecx
+; FALLBACK22-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK22-NEXT:    shrl %ecx
+; FALLBACK22-NEXT:    shrxl %ebx, %ecx, %eax
+; FALLBACK22-NEXT:    movl 44(%edi), %ecx
+; FALLBACK22-NEXT:    shlxl %edx, %ecx, %ebp
+; FALLBACK22-NEXT:    orl %ebp, %eax
+; FALLBACK22-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK22-NEXT:    shlxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; FALLBACK22-NEXT:    shrl %esi
+; FALLBACK22-NEXT:    shrxl %ebx, %esi, %esi
+; FALLBACK22-NEXT:    orl %eax, %esi
+; FALLBACK22-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK22-NEXT:    movl 48(%edi), %esi
+; FALLBACK22-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK22-NEXT:    shrl %esi
+; FALLBACK22-NEXT:    shrxl %ebx, %esi, %eax
+; FALLBACK22-NEXT:    movl 52(%edi), %esi
+; FALLBACK22-NEXT:    shlxl %edx, %esi, %ebp
+; FALLBACK22-NEXT:    orl %ebp, %eax
+; FALLBACK22-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK22-NEXT:    shlxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; FALLBACK22-NEXT:    shrl %ecx
+; FALLBACK22-NEXT:    shrxl %ebx, %ecx, %ebp
+; FALLBACK22-NEXT:    orl %eax, %ebp
+; FALLBACK22-NEXT:    shlxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; FALLBACK22-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK22-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK22-NEXT:    negl %eax
+; FALLBACK22-NEXT:    shlxl %edx, 188(%esp,%eax), %ecx
+; FALLBACK22-NEXT:    movl 56(%edi), %eax
+; FALLBACK22-NEXT:    shlxl %edx, %eax, %edx
+; FALLBACK22-NEXT:    shrl %esi
+; FALLBACK22-NEXT:    shrxl %ebx, %esi, %esi
+; FALLBACK22-NEXT:    orl %edx, %esi
+; FALLBACK22-NEXT:    shrl %eax
+; FALLBACK22-NEXT:    shrxl %ebx, %eax, %eax
+; FALLBACK22-NEXT:    orl %eax, %ecx
+; FALLBACK22-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; FALLBACK22-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; FALLBACK22-NEXT:    movl %edx, (%eax)
+; FALLBACK22-NEXT:    movl %esi, 56(%eax)
+; FALLBACK22-NEXT:    movl %ecx, 60(%eax)
+; FALLBACK22-NEXT:    movl %ebp, 48(%eax)
+; FALLBACK22-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK22-NEXT:    movl %ecx, 52(%eax)
+; FALLBACK22-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK22-NEXT:    movl %ecx, 40(%eax)
+; FALLBACK22-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK22-NEXT:    movl %ecx, 44(%eax)
+; FALLBACK22-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK22-NEXT:    movl %ecx, 32(%eax)
+; FALLBACK22-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK22-NEXT:    movl %ecx, 36(%eax)
+; FALLBACK22-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK22-NEXT:    movl %ecx, 24(%eax)
+; FALLBACK22-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK22-NEXT:    movl %ecx, 28(%eax)
+; FALLBACK22-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK22-NEXT:    movl %ecx, 16(%eax)
+; FALLBACK22-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK22-NEXT:    movl %ecx, 20(%eax)
+; FALLBACK22-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK22-NEXT:    movl %ecx, 8(%eax)
+; FALLBACK22-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK22-NEXT:    movl %ecx, 12(%eax)
+; FALLBACK22-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK22-NEXT:    movl %ecx, 4(%eax)
+; FALLBACK22-NEXT:    addl $204, %esp
+; FALLBACK22-NEXT:    popl %esi
+; FALLBACK22-NEXT:    popl %edi
+; FALLBACK22-NEXT:    popl %ebx
+; FALLBACK22-NEXT:    popl %ebp
+; FALLBACK22-NEXT:    retl
+;
+; FALLBACK23-LABEL: shl_64bytes:
+; FALLBACK23:       # %bb.0:
+; FALLBACK23-NEXT:    pushl %ebp
+; FALLBACK23-NEXT:    pushl %ebx
+; FALLBACK23-NEXT:    pushl %edi
+; FALLBACK23-NEXT:    pushl %esi
+; FALLBACK23-NEXT:    subl $204, %esp
+; FALLBACK23-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; FALLBACK23-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; FALLBACK23-NEXT:    movups (%ecx), %xmm0
+; FALLBACK23-NEXT:    movups 16(%ecx), %xmm1
+; FALLBACK23-NEXT:    movups 32(%ecx), %xmm2
+; FALLBACK23-NEXT:    movups 48(%ecx), %xmm3
+; FALLBACK23-NEXT:    movl (%eax), %ebp
+; FALLBACK23-NEXT:    xorps %xmm4, %xmm4
+; FALLBACK23-NEXT:    movaps %xmm4, {{[0-9]+}}(%esp)
+; FALLBACK23-NEXT:    movaps %xmm4, {{[0-9]+}}(%esp)
+; FALLBACK23-NEXT:    movaps %xmm4, {{[0-9]+}}(%esp)
+; FALLBACK23-NEXT:    movaps %xmm4, {{[0-9]+}}(%esp)
+; FALLBACK23-NEXT:    movaps %xmm3, {{[0-9]+}}(%esp)
+; FALLBACK23-NEXT:    movaps %xmm2, {{[0-9]+}}(%esp)
+; FALLBACK23-NEXT:    movaps %xmm1, {{[0-9]+}}(%esp)
+; FALLBACK23-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
+; FALLBACK23-NEXT:    leal (,%ebp,8), %ecx
+; FALLBACK23-NEXT:    andl $24, %ecx
+; FALLBACK23-NEXT:    andl $60, %ebp
+; FALLBACK23-NEXT:    leal {{[0-9]+}}(%esp), %eax
+; FALLBACK23-NEXT:    subl %ebp, %eax
+; FALLBACK23-NEXT:    movl 4(%eax), %esi
+; FALLBACK23-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK23-NEXT:    movl 8(%eax), %edi
+; FALLBACK23-NEXT:    movl 12(%eax), %edx
+; FALLBACK23-NEXT:    movl %edx, %ebx
+; FALLBACK23-NEXT:    shldl %cl, %edi, %ebx
+; FALLBACK23-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK23-NEXT:    shldl %cl, %esi, %edi
+; FALLBACK23-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK23-NEXT:    movl 16(%eax), %edi
+; FALLBACK23-NEXT:    movl 20(%eax), %esi
+; FALLBACK23-NEXT:    movl %esi, %ebx
+; FALLBACK23-NEXT:    shldl %cl, %edi, %ebx
+; FALLBACK23-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK23-NEXT:    shldl %cl, %edx, %edi
+; FALLBACK23-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK23-NEXT:    movl 24(%eax), %edi
+; FALLBACK23-NEXT:    movl 28(%eax), %edx
+; FALLBACK23-NEXT:    movl %edx, %ebx
+; FALLBACK23-NEXT:    shldl %cl, %edi, %ebx
+; FALLBACK23-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK23-NEXT:    shldl %cl, %esi, %edi
+; FALLBACK23-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK23-NEXT:    movl 32(%eax), %edi
+; FALLBACK23-NEXT:    movl 36(%eax), %esi
+; FALLBACK23-NEXT:    movl %esi, %ebx
+; FALLBACK23-NEXT:    shldl %cl, %edi, %ebx
+; FALLBACK23-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK23-NEXT:    shldl %cl, %edx, %edi
+; FALLBACK23-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK23-NEXT:    movl 40(%eax), %ebx
+; FALLBACK23-NEXT:    movl 44(%eax), %edx
+; FALLBACK23-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK23-NEXT:    shldl %cl, %ebx, %edx
+; FALLBACK23-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK23-NEXT:    shldl %cl, %esi, %ebx
+; FALLBACK23-NEXT:    movl 56(%eax), %edx
+; FALLBACK23-NEXT:    movl 60(%eax), %edi
+; FALLBACK23-NEXT:    shldl %cl, %edx, %edi
+; FALLBACK23-NEXT:    movl (%eax), %esi
+; FALLBACK23-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK23-NEXT:    movl 52(%eax), %esi
+; FALLBACK23-NEXT:    shldl %cl, %esi, %edx
+; FALLBACK23-NEXT:    negl %ebp
+; FALLBACK23-NEXT:    movl 176(%esp,%ebp), %ebp
+; FALLBACK23-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; FALLBACK23-NEXT:    movl %edx, 56(%eax)
+; FALLBACK23-NEXT:    movl %edi, 60(%eax)
+; FALLBACK23-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; FALLBACK23-NEXT:    shlxl %ecx, %edx, %edi
+; FALLBACK23-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK23-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; FALLBACK23-NEXT:    shldl %cl, %edx, %edi
+; FALLBACK23-NEXT:    shldl %cl, %ebp, %esi
+; FALLBACK23-NEXT:    # kill: def $cl killed $cl killed $ecx
+; FALLBACK23-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; FALLBACK23-NEXT:    shldl %cl, %edx, %ebp
+; FALLBACK23-NEXT:    movl %ebp, 48(%eax)
+; FALLBACK23-NEXT:    movl %esi, 52(%eax)
+; FALLBACK23-NEXT:    movl %ebx, 40(%eax)
+; FALLBACK23-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK23-NEXT:    movl %ecx, 44(%eax)
+; FALLBACK23-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK23-NEXT:    movl %ecx, 32(%eax)
+; FALLBACK23-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK23-NEXT:    movl %ecx, 36(%eax)
+; FALLBACK23-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK23-NEXT:    movl %ecx, 24(%eax)
+; FALLBACK23-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK23-NEXT:    movl %ecx, 28(%eax)
+; FALLBACK23-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK23-NEXT:    movl %ecx, 16(%eax)
+; FALLBACK23-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK23-NEXT:    movl %ecx, 20(%eax)
+; FALLBACK23-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK23-NEXT:    movl %ecx, 8(%eax)
+; FALLBACK23-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK23-NEXT:    movl %ecx, 12(%eax)
+; FALLBACK23-NEXT:    movl %edi, 4(%eax)
+; FALLBACK23-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK23-NEXT:    movl %ecx, (%eax)
+; FALLBACK23-NEXT:    addl $204, %esp
+; FALLBACK23-NEXT:    popl %esi
+; FALLBACK23-NEXT:    popl %edi
+; FALLBACK23-NEXT:    popl %ebx
+; FALLBACK23-NEXT:    popl %ebp
+; FALLBACK23-NEXT:    retl
+;
+; FALLBACK24-LABEL: shl_64bytes:
+; FALLBACK24:       # %bb.0:
+; FALLBACK24-NEXT:    pushl %ebp
+; FALLBACK24-NEXT:    pushl %ebx
+; FALLBACK24-NEXT:    pushl %edi
+; FALLBACK24-NEXT:    pushl %esi
+; FALLBACK24-NEXT:    subl $204, %esp
+; FALLBACK24-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; FALLBACK24-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; FALLBACK24-NEXT:    vmovups (%ecx), %ymm0
+; FALLBACK24-NEXT:    vmovups 32(%ecx), %ymm1
+; FALLBACK24-NEXT:    movl (%eax), %eax
+; FALLBACK24-NEXT:    vxorps %xmm2, %xmm2, %xmm2
+; FALLBACK24-NEXT:    vmovups %ymm2, {{[0-9]+}}(%esp)
+; FALLBACK24-NEXT:    vmovups %ymm2, {{[0-9]+}}(%esp)
+; FALLBACK24-NEXT:    vmovups %ymm1, {{[0-9]+}}(%esp)
+; FALLBACK24-NEXT:    vmovups %ymm0, {{[0-9]+}}(%esp)
+; FALLBACK24-NEXT:    movl %eax, %edx
+; FALLBACK24-NEXT:    andl $60, %edx
+; FALLBACK24-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK24-NEXT:    leal {{[0-9]+}}(%esp), %ecx
+; FALLBACK24-NEXT:    subl %edx, %ecx
+; FALLBACK24-NEXT:    movl (%ecx), %edi
+; FALLBACK24-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK24-NEXT:    movl 4(%ecx), %edx
+; FALLBACK24-NEXT:    movl %ecx, %ebp
+; FALLBACK24-NEXT:    shll $3, %eax
+; FALLBACK24-NEXT:    andl $24, %eax
+; FALLBACK24-NEXT:    movl %edx, %esi
+; FALLBACK24-NEXT:    movl %eax, %ecx
+; FALLBACK24-NEXT:    shll %cl, %esi
+; FALLBACK24-NEXT:    shrl %edi
+; FALLBACK24-NEXT:    movb %al, %ch
+; FALLBACK24-NEXT:    notb %ch
+; FALLBACK24-NEXT:    movb %ch, %cl
+; FALLBACK24-NEXT:    shrl %cl, %edi
+; FALLBACK24-NEXT:    orl %esi, %edi
+; FALLBACK24-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK24-NEXT:    movl 12(%ebp), %ebx
+; FALLBACK24-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK24-NEXT:    movb %al, %cl
+; FALLBACK24-NEXT:    shll %cl, %ebx
+; FALLBACK24-NEXT:    movl 8(%ebp), %esi
+; FALLBACK24-NEXT:    movl %ebp, %edi
+; FALLBACK24-NEXT:    movl %esi, %ebp
+; FALLBACK24-NEXT:    shrl %ebp
+; FALLBACK24-NEXT:    movb %ch, %cl
+; FALLBACK24-NEXT:    shrl %cl, %ebp
+; FALLBACK24-NEXT:    orl %ebx, %ebp
+; FALLBACK24-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK24-NEXT:    movb %al, %cl
+; FALLBACK24-NEXT:    shll %cl, %esi
+; FALLBACK24-NEXT:    shrl %edx
+; FALLBACK24-NEXT:    movb %ch, %cl
+; FALLBACK24-NEXT:    shrl %cl, %edx
+; FALLBACK24-NEXT:    orl %esi, %edx
+; FALLBACK24-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK24-NEXT:    movl %edi, %ebp
+; FALLBACK24-NEXT:    movl 20(%edi), %ebx
+; FALLBACK24-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK24-NEXT:    movb %al, %cl
+; FALLBACK24-NEXT:    shll %cl, %ebx
+; FALLBACK24-NEXT:    movl 16(%edi), %esi
+; FALLBACK24-NEXT:    movl %esi, %edx
+; FALLBACK24-NEXT:    shrl %edx
+; FALLBACK24-NEXT:    movb %ch, %cl
+; FALLBACK24-NEXT:    shrl %cl, %edx
+; FALLBACK24-NEXT:    orl %ebx, %edx
+; FALLBACK24-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK24-NEXT:    movb %al, %cl
+; FALLBACK24-NEXT:    shll %cl, %esi
+; FALLBACK24-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; FALLBACK24-NEXT:    shrl %edi
+; FALLBACK24-NEXT:    movb %ch, %cl
+; FALLBACK24-NEXT:    shrl %cl, %edi
+; FALLBACK24-NEXT:    orl %esi, %edi
+; FALLBACK24-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK24-NEXT:    movl %ebp, %edx
+; FALLBACK24-NEXT:    movl 28(%ebp), %ebx
+; FALLBACK24-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK24-NEXT:    movb %al, %cl
+; FALLBACK24-NEXT:    shll %cl, %ebx
+; FALLBACK24-NEXT:    movl 24(%ebp), %esi
+; FALLBACK24-NEXT:    movl %esi, %edi
+; FALLBACK24-NEXT:    shrl %edi
+; FALLBACK24-NEXT:    movb %ch, %cl
+; FALLBACK24-NEXT:    shrl %cl, %edi
+; FALLBACK24-NEXT:    orl %ebx, %edi
+; FALLBACK24-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK24-NEXT:    movb %al, %cl
+; FALLBACK24-NEXT:    shll %cl, %esi
+; FALLBACK24-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
+; FALLBACK24-NEXT:    shrl %ebp
+; FALLBACK24-NEXT:    movb %ch, %cl
+; FALLBACK24-NEXT:    shrl %cl, %ebp
+; FALLBACK24-NEXT:    orl %esi, %ebp
+; FALLBACK24-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK24-NEXT:    movl 36(%edx), %ebx
+; FALLBACK24-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK24-NEXT:    movb %al, %cl
+; FALLBACK24-NEXT:    shll %cl, %ebx
+; FALLBACK24-NEXT:    movl 32(%edx), %esi
+; FALLBACK24-NEXT:    movl %edx, %ebp
+; FALLBACK24-NEXT:    movl %esi, %edi
+; FALLBACK24-NEXT:    shrl %edi
+; FALLBACK24-NEXT:    movb %ch, %cl
+; FALLBACK24-NEXT:    shrl %cl, %edi
+; FALLBACK24-NEXT:    orl %ebx, %edi
+; FALLBACK24-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK24-NEXT:    movb %al, %cl
+; FALLBACK24-NEXT:    shll %cl, %esi
+; FALLBACK24-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; FALLBACK24-NEXT:    shrl %edx
+; FALLBACK24-NEXT:    movb %ch, %cl
+; FALLBACK24-NEXT:    shrl %cl, %edx
+; FALLBACK24-NEXT:    orl %esi, %edx
+; FALLBACK24-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK24-NEXT:    movl 44(%ebp), %ebx
+; FALLBACK24-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK24-NEXT:    movb %al, %cl
+; FALLBACK24-NEXT:    shll %cl, %ebx
+; FALLBACK24-NEXT:    movl 40(%ebp), %esi
+; FALLBACK24-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK24-NEXT:    movl %esi, %edx
+; FALLBACK24-NEXT:    shrl %edx
+; FALLBACK24-NEXT:    movb %ch, %cl
+; FALLBACK24-NEXT:    shrl %cl, %edx
+; FALLBACK24-NEXT:    orl %ebx, %edx
+; FALLBACK24-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK24-NEXT:    movb %al, %cl
+; FALLBACK24-NEXT:    shll %cl, %esi
+; FALLBACK24-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; FALLBACK24-NEXT:    shrl %edx
+; FALLBACK24-NEXT:    movb %ch, %cl
+; FALLBACK24-NEXT:    shrl %cl, %edx
+; FALLBACK24-NEXT:    orl %esi, %edx
+; FALLBACK24-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK24-NEXT:    movl 52(%ebp), %esi
+; FALLBACK24-NEXT:    movl %esi, %edi
+; FALLBACK24-NEXT:    movb %al, %cl
+; FALLBACK24-NEXT:    shll %cl, %edi
+; FALLBACK24-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; FALLBACK24-NEXT:    negl %edx
+; FALLBACK24-NEXT:    movl 176(%esp,%edx), %ebx
+; FALLBACK24-NEXT:    movl %ebx, %ebp
+; FALLBACK24-NEXT:    shrl %ebp
+; FALLBACK24-NEXT:    movb %ch, %cl
+; FALLBACK24-NEXT:    shrl %cl, %ebp
+; FALLBACK24-NEXT:    orl %edi, %ebp
+; FALLBACK24-NEXT:    movb %al, %cl
+; FALLBACK24-NEXT:    shll %cl, %ebx
+; FALLBACK24-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; FALLBACK24-NEXT:    shrl %edx
+; FALLBACK24-NEXT:    movb %ch, %cl
+; FALLBACK24-NEXT:    shrl %cl, %edx
+; FALLBACK24-NEXT:    orl %ebx, %edx
+; FALLBACK24-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK24-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; FALLBACK24-NEXT:    movl 60(%edi), %edx
+; FALLBACK24-NEXT:    movb %al, %cl
+; FALLBACK24-NEXT:    shll %cl, %edx
+; FALLBACK24-NEXT:    movl 56(%edi), %ebx
+; FALLBACK24-NEXT:    movl %ebx, %edi
+; FALLBACK24-NEXT:    shrl %edi
+; FALLBACK24-NEXT:    movb %ch, %cl
+; FALLBACK24-NEXT:    shrl %cl, %edi
+; FALLBACK24-NEXT:    orl %edx, %edi
+; FALLBACK24-NEXT:    movb %al, %cl
+; FALLBACK24-NEXT:    shll %cl, %ebx
+; FALLBACK24-NEXT:    shrl %esi
+; FALLBACK24-NEXT:    movb %ch, %cl
+; FALLBACK24-NEXT:    shrl %cl, %esi
+; FALLBACK24-NEXT:    orl %ebx, %esi
+; FALLBACK24-NEXT:    movl %eax, %ecx
+; FALLBACK24-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; FALLBACK24-NEXT:    shll %cl, %edx
+; FALLBACK24-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; FALLBACK24-NEXT:    movl %edx, (%eax)
+; FALLBACK24-NEXT:    movl %esi, 56(%eax)
+; FALLBACK24-NEXT:    movl %edi, 60(%eax)
+; FALLBACK24-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK24-NEXT:    movl %ecx, 48(%eax)
+; FALLBACK24-NEXT:    movl %ebp, 52(%eax)
+; FALLBACK24-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK24-NEXT:    movl %ecx, 40(%eax)
+; FALLBACK24-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK24-NEXT:    movl %ecx, 44(%eax)
+; FALLBACK24-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK24-NEXT:    movl %ecx, 32(%eax)
+; FALLBACK24-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK24-NEXT:    movl %ecx, 36(%eax)
+; FALLBACK24-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK24-NEXT:    movl %ecx, 24(%eax)
+; FALLBACK24-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK24-NEXT:    movl %ecx, 28(%eax)
+; FALLBACK24-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK24-NEXT:    movl %ecx, 16(%eax)
+; FALLBACK24-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK24-NEXT:    movl %ecx, 20(%eax)
+; FALLBACK24-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK24-NEXT:    movl %ecx, 8(%eax)
+; FALLBACK24-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK24-NEXT:    movl %ecx, 12(%eax)
+; FALLBACK24-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK24-NEXT:    movl %ecx, 4(%eax)
+; FALLBACK24-NEXT:    addl $204, %esp
+; FALLBACK24-NEXT:    popl %esi
+; FALLBACK24-NEXT:    popl %edi
+; FALLBACK24-NEXT:    popl %ebx
+; FALLBACK24-NEXT:    popl %ebp
+; FALLBACK24-NEXT:    vzeroupper
+; FALLBACK24-NEXT:    retl
+;
+; FALLBACK25-LABEL: shl_64bytes:
+; FALLBACK25:       # %bb.0:
+; FALLBACK25-NEXT:    pushl %ebp
+; FALLBACK25-NEXT:    pushl %ebx
+; FALLBACK25-NEXT:    pushl %edi
+; FALLBACK25-NEXT:    pushl %esi
+; FALLBACK25-NEXT:    subl $188, %esp
+; FALLBACK25-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; FALLBACK25-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; FALLBACK25-NEXT:    vmovups (%ecx), %ymm0
+; FALLBACK25-NEXT:    vmovups 32(%ecx), %ymm1
+; FALLBACK25-NEXT:    movl (%eax), %ecx
+; FALLBACK25-NEXT:    vxorps %xmm2, %xmm2, %xmm2
+; FALLBACK25-NEXT:    vmovups %ymm2, {{[0-9]+}}(%esp)
+; FALLBACK25-NEXT:    vmovups %ymm2, {{[0-9]+}}(%esp)
+; FALLBACK25-NEXT:    vmovups %ymm1, {{[0-9]+}}(%esp)
+; FALLBACK25-NEXT:    vmovups %ymm0, {{[0-9]+}}(%esp)
+; FALLBACK25-NEXT:    movl %ecx, %ebp
+; FALLBACK25-NEXT:    andl $60, %ebp
+; FALLBACK25-NEXT:    leal {{[0-9]+}}(%esp), %eax
+; FALLBACK25-NEXT:    subl %ebp, %eax
+; FALLBACK25-NEXT:    movl 8(%eax), %esi
+; FALLBACK25-NEXT:    movl 12(%eax), %edx
+; FALLBACK25-NEXT:    shll $3, %ecx
+; FALLBACK25-NEXT:    andl $24, %ecx
+; FALLBACK25-NEXT:    movl %edx, %edi
+; FALLBACK25-NEXT:    shldl %cl, %esi, %edi
+; FALLBACK25-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK25-NEXT:    movl 4(%eax), %edi
+; FALLBACK25-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK25-NEXT:    shldl %cl, %edi, %esi
+; FALLBACK25-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK25-NEXT:    movl 16(%eax), %edi
+; FALLBACK25-NEXT:    movl 20(%eax), %esi
+; FALLBACK25-NEXT:    movl %esi, %ebx
+; FALLBACK25-NEXT:    shldl %cl, %edi, %ebx
+; FALLBACK25-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK25-NEXT:    shldl %cl, %edx, %edi
+; FALLBACK25-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK25-NEXT:    movl 24(%eax), %edi
+; FALLBACK25-NEXT:    movl 28(%eax), %edx
+; FALLBACK25-NEXT:    movl %edx, %ebx
+; FALLBACK25-NEXT:    shldl %cl, %edi, %ebx
+; FALLBACK25-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK25-NEXT:    shldl %cl, %esi, %edi
+; FALLBACK25-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK25-NEXT:    movl 32(%eax), %edi
+; FALLBACK25-NEXT:    movl 36(%eax), %esi
+; FALLBACK25-NEXT:    movl %esi, %ebx
+; FALLBACK25-NEXT:    shldl %cl, %edi, %ebx
+; FALLBACK25-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK25-NEXT:    shldl %cl, %edx, %edi
+; FALLBACK25-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK25-NEXT:    movl 40(%eax), %edx
+; FALLBACK25-NEXT:    movl 44(%eax), %edi
+; FALLBACK25-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK25-NEXT:    shldl %cl, %edx, %edi
+; FALLBACK25-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK25-NEXT:    shldl %cl, %esi, %edx
+; FALLBACK25-NEXT:    movl %edx, (%esp) # 4-byte Spill
+; FALLBACK25-NEXT:    movl 56(%eax), %edx
+; FALLBACK25-NEXT:    movl 60(%eax), %edi
+; FALLBACK25-NEXT:    shldl %cl, %edx, %edi
+; FALLBACK25-NEXT:    movl (%eax), %ebx
+; FALLBACK25-NEXT:    movl 52(%eax), %esi
+; FALLBACK25-NEXT:    shldl %cl, %esi, %edx
+; FALLBACK25-NEXT:    negl %ebp
+; FALLBACK25-NEXT:    movl 160(%esp,%ebp), %eax
+; FALLBACK25-NEXT:    movl {{[0-9]+}}(%esp), %ebp
+; FALLBACK25-NEXT:    movl %edx, 56(%ebp)
+; FALLBACK25-NEXT:    movl %edi, 60(%ebp)
+; FALLBACK25-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; FALLBACK25-NEXT:    shldl %cl, %ebx, %edx
+; FALLBACK25-NEXT:    shll %cl, %ebx
+; FALLBACK25-NEXT:    shldl %cl, %eax, %esi
+; FALLBACK25-NEXT:    # kill: def $cl killed $cl killed $ecx
+; FALLBACK25-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; FALLBACK25-NEXT:    shldl %cl, %edi, %eax
+; FALLBACK25-NEXT:    movl %eax, 48(%ebp)
+; FALLBACK25-NEXT:    movl %esi, 52(%ebp)
+; FALLBACK25-NEXT:    movl (%esp), %eax # 4-byte Reload
+; FALLBACK25-NEXT:    movl %eax, 40(%ebp)
+; FALLBACK25-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK25-NEXT:    movl %eax, 44(%ebp)
+; FALLBACK25-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK25-NEXT:    movl %eax, 32(%ebp)
+; FALLBACK25-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK25-NEXT:    movl %eax, 36(%ebp)
+; FALLBACK25-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK25-NEXT:    movl %eax, 24(%ebp)
+; FALLBACK25-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK25-NEXT:    movl %eax, 28(%ebp)
+; FALLBACK25-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK25-NEXT:    movl %eax, 16(%ebp)
+; FALLBACK25-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK25-NEXT:    movl %eax, 20(%ebp)
+; FALLBACK25-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK25-NEXT:    movl %eax, 8(%ebp)
+; FALLBACK25-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK25-NEXT:    movl %eax, 12(%ebp)
+; FALLBACK25-NEXT:    movl %ebx, (%ebp)
+; FALLBACK25-NEXT:    movl %edx, 4(%ebp)
+; FALLBACK25-NEXT:    addl $188, %esp
+; FALLBACK25-NEXT:    popl %esi
+; FALLBACK25-NEXT:    popl %edi
+; FALLBACK25-NEXT:    popl %ebx
+; FALLBACK25-NEXT:    popl %ebp
+; FALLBACK25-NEXT:    vzeroupper
+; FALLBACK25-NEXT:    retl
+;
+; FALLBACK26-LABEL: shl_64bytes:
+; FALLBACK26:       # %bb.0:
+; FALLBACK26-NEXT:    pushl %ebp
+; FALLBACK26-NEXT:    pushl %ebx
+; FALLBACK26-NEXT:    pushl %edi
+; FALLBACK26-NEXT:    pushl %esi
+; FALLBACK26-NEXT:    subl $204, %esp
+; FALLBACK26-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; FALLBACK26-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; FALLBACK26-NEXT:    vmovups (%ecx), %ymm0
+; FALLBACK26-NEXT:    vmovups 32(%ecx), %ymm1
+; FALLBACK26-NEXT:    movl (%eax), %eax
+; FALLBACK26-NEXT:    vxorps %xmm2, %xmm2, %xmm2
+; FALLBACK26-NEXT:    vmovups %ymm2, {{[0-9]+}}(%esp)
+; FALLBACK26-NEXT:    vmovups %ymm2, {{[0-9]+}}(%esp)
+; FALLBACK26-NEXT:    vmovups %ymm1, {{[0-9]+}}(%esp)
+; FALLBACK26-NEXT:    vmovups %ymm0, {{[0-9]+}}(%esp)
+; FALLBACK26-NEXT:    leal (,%eax,8), %edx
+; FALLBACK26-NEXT:    andl $24, %edx
+; FALLBACK26-NEXT:    andl $60, %eax
+; FALLBACK26-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK26-NEXT:    leal {{[0-9]+}}(%esp), %edi
+; FALLBACK26-NEXT:    subl %eax, %edi
+; FALLBACK26-NEXT:    movl (%edi), %ecx
+; FALLBACK26-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK26-NEXT:    movl 4(%edi), %eax
+; FALLBACK26-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK26-NEXT:    movl %edx, %ebx
+; FALLBACK26-NEXT:    notb %bl
+; FALLBACK26-NEXT:    shrl %ecx
+; FALLBACK26-NEXT:    shrxl %ebx, %ecx, %esi
+; FALLBACK26-NEXT:    shlxl %edx, %eax, %ecx
+; FALLBACK26-NEXT:    orl %ecx, %esi
+; FALLBACK26-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK26-NEXT:    movl 8(%edi), %esi
+; FALLBACK26-NEXT:    movl %esi, %ecx
+; FALLBACK26-NEXT:    shrl %ecx
+; FALLBACK26-NEXT:    shrxl %ebx, %ecx, %eax
+; FALLBACK26-NEXT:    movl 12(%edi), %ecx
+; FALLBACK26-NEXT:    shlxl %edx, %ecx, %ebp
+; FALLBACK26-NEXT:    orl %ebp, %eax
+; FALLBACK26-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK26-NEXT:    shlxl %edx, %esi, %esi
+; FALLBACK26-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK26-NEXT:    shrl %eax
+; FALLBACK26-NEXT:    shrxl %ebx, %eax, %eax
+; FALLBACK26-NEXT:    orl %esi, %eax
+; FALLBACK26-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK26-NEXT:    movl 16(%edi), %eax
+; FALLBACK26-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK26-NEXT:    shrl %eax
+; FALLBACK26-NEXT:    shrxl %ebx, %eax, %eax
+; FALLBACK26-NEXT:    movl 20(%edi), %esi
+; FALLBACK26-NEXT:    shlxl %edx, %esi, %ebp
+; FALLBACK26-NEXT:    orl %ebp, %eax
+; FALLBACK26-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK26-NEXT:    shlxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; FALLBACK26-NEXT:    shrl %ecx
+; FALLBACK26-NEXT:    shrxl %ebx, %ecx, %ecx
+; FALLBACK26-NEXT:    orl %eax, %ecx
+; FALLBACK26-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK26-NEXT:    movl 24(%edi), %ecx
+; FALLBACK26-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK26-NEXT:    shrl %ecx
+; FALLBACK26-NEXT:    shrxl %ebx, %ecx, %eax
+; FALLBACK26-NEXT:    movl 28(%edi), %ecx
+; FALLBACK26-NEXT:    shlxl %edx, %ecx, %ebp
+; FALLBACK26-NEXT:    orl %ebp, %eax
+; FALLBACK26-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK26-NEXT:    shlxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; FALLBACK26-NEXT:    shrl %esi
+; FALLBACK26-NEXT:    shrxl %ebx, %esi, %esi
+; FALLBACK26-NEXT:    orl %eax, %esi
+; FALLBACK26-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK26-NEXT:    movl 32(%edi), %eax
+; FALLBACK26-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK26-NEXT:    shrl %eax
+; FALLBACK26-NEXT:    shrxl %ebx, %eax, %eax
+; FALLBACK26-NEXT:    movl 36(%edi), %esi
+; FALLBACK26-NEXT:    shlxl %edx, %esi, %ebp
+; FALLBACK26-NEXT:    orl %ebp, %eax
+; FALLBACK26-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK26-NEXT:    shlxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; FALLBACK26-NEXT:    shrl %ecx
+; FALLBACK26-NEXT:    shrxl %ebx, %ecx, %ecx
+; FALLBACK26-NEXT:    orl %eax, %ecx
+; FALLBACK26-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK26-NEXT:    movl 40(%edi), %ecx
+; FALLBACK26-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK26-NEXT:    shrl %ecx
+; FALLBACK26-NEXT:    shrxl %ebx, %ecx, %eax
+; FALLBACK26-NEXT:    movl 44(%edi), %ecx
+; FALLBACK26-NEXT:    shlxl %edx, %ecx, %ebp
+; FALLBACK26-NEXT:    orl %ebp, %eax
+; FALLBACK26-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK26-NEXT:    shlxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; FALLBACK26-NEXT:    shrl %esi
+; FALLBACK26-NEXT:    shrxl %ebx, %esi, %esi
+; FALLBACK26-NEXT:    orl %eax, %esi
+; FALLBACK26-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK26-NEXT:    movl 48(%edi), %esi
+; FALLBACK26-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK26-NEXT:    shrl %esi
+; FALLBACK26-NEXT:    shrxl %ebx, %esi, %eax
+; FALLBACK26-NEXT:    movl 52(%edi), %esi
+; FALLBACK26-NEXT:    shlxl %edx, %esi, %ebp
+; FALLBACK26-NEXT:    orl %ebp, %eax
+; FALLBACK26-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK26-NEXT:    shlxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; FALLBACK26-NEXT:    shrl %ecx
+; FALLBACK26-NEXT:    shrxl %ebx, %ecx, %ebp
+; FALLBACK26-NEXT:    orl %eax, %ebp
+; FALLBACK26-NEXT:    shlxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; FALLBACK26-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK26-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK26-NEXT:    negl %eax
+; FALLBACK26-NEXT:    shlxl %edx, 188(%esp,%eax), %ecx
+; FALLBACK26-NEXT:    movl 56(%edi), %eax
+; FALLBACK26-NEXT:    shlxl %edx, %eax, %edx
+; FALLBACK26-NEXT:    shrl %esi
+; FALLBACK26-NEXT:    shrxl %ebx, %esi, %esi
+; FALLBACK26-NEXT:    orl %edx, %esi
+; FALLBACK26-NEXT:    shrl %eax
+; FALLBACK26-NEXT:    shrxl %ebx, %eax, %eax
+; FALLBACK26-NEXT:    orl %eax, %ecx
+; FALLBACK26-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; FALLBACK26-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; FALLBACK26-NEXT:    movl %edx, (%eax)
+; FALLBACK26-NEXT:    movl %esi, 56(%eax)
+; FALLBACK26-NEXT:    movl %ecx, 60(%eax)
+; FALLBACK26-NEXT:    movl %ebp, 48(%eax)
+; FALLBACK26-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK26-NEXT:    movl %ecx, 52(%eax)
+; FALLBACK26-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK26-NEXT:    movl %ecx, 40(%eax)
+; FALLBACK26-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK26-NEXT:    movl %ecx, 44(%eax)
+; FALLBACK26-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK26-NEXT:    movl %ecx, 32(%eax)
+; FALLBACK26-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK26-NEXT:    movl %ecx, 36(%eax)
+; FALLBACK26-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK26-NEXT:    movl %ecx, 24(%eax)
+; FALLBACK26-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK26-NEXT:    movl %ecx, 28(%eax)
+; FALLBACK26-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK26-NEXT:    movl %ecx, 16(%eax)
+; FALLBACK26-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK26-NEXT:    movl %ecx, 20(%eax)
+; FALLBACK26-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK26-NEXT:    movl %ecx, 8(%eax)
+; FALLBACK26-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK26-NEXT:    movl %ecx, 12(%eax)
+; FALLBACK26-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK26-NEXT:    movl %ecx, 4(%eax)
+; FALLBACK26-NEXT:    addl $204, %esp
+; FALLBACK26-NEXT:    popl %esi
+; FALLBACK26-NEXT:    popl %edi
+; FALLBACK26-NEXT:    popl %ebx
+; FALLBACK26-NEXT:    popl %ebp
+; FALLBACK26-NEXT:    vzeroupper
+; FALLBACK26-NEXT:    retl
+;
+; FALLBACK27-LABEL: shl_64bytes:
+; FALLBACK27:       # %bb.0:
+; FALLBACK27-NEXT:    pushl %ebp
+; FALLBACK27-NEXT:    pushl %ebx
+; FALLBACK27-NEXT:    pushl %edi
+; FALLBACK27-NEXT:    pushl %esi
+; FALLBACK27-NEXT:    subl $204, %esp
+; FALLBACK27-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; FALLBACK27-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; FALLBACK27-NEXT:    vmovups (%ecx), %ymm0
+; FALLBACK27-NEXT:    vmovups 32(%ecx), %ymm1
+; FALLBACK27-NEXT:    movl (%eax), %ebx
+; FALLBACK27-NEXT:    vxorps %xmm2, %xmm2, %xmm2
+; FALLBACK27-NEXT:    vmovups %ymm2, {{[0-9]+}}(%esp)
+; FALLBACK27-NEXT:    vmovups %ymm2, {{[0-9]+}}(%esp)
+; FALLBACK27-NEXT:    vmovups %ymm1, {{[0-9]+}}(%esp)
+; FALLBACK27-NEXT:    vmovups %ymm0, {{[0-9]+}}(%esp)
+; FALLBACK27-NEXT:    leal (,%ebx,8), %ecx
+; FALLBACK27-NEXT:    andl $24, %ecx
+; FALLBACK27-NEXT:    andl $60, %ebx
+; FALLBACK27-NEXT:    leal {{[0-9]+}}(%esp), %eax
+; FALLBACK27-NEXT:    subl %ebx, %eax
+; FALLBACK27-NEXT:    movl 4(%eax), %esi
+; FALLBACK27-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK27-NEXT:    movl 8(%eax), %edi
+; FALLBACK27-NEXT:    movl 12(%eax), %edx
+; FALLBACK27-NEXT:    movl %edx, %ebp
+; FALLBACK27-NEXT:    shldl %cl, %edi, %ebp
+; FALLBACK27-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK27-NEXT:    shldl %cl, %esi, %edi
+; FALLBACK27-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK27-NEXT:    movl 16(%eax), %edi
+; FALLBACK27-NEXT:    movl 20(%eax), %esi
+; FALLBACK27-NEXT:    movl %esi, %ebp
+; FALLBACK27-NEXT:    shldl %cl, %edi, %ebp
+; FALLBACK27-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK27-NEXT:    shldl %cl, %edx, %edi
+; FALLBACK27-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK27-NEXT:    movl 24(%eax), %edi
+; FALLBACK27-NEXT:    movl 28(%eax), %edx
+; FALLBACK27-NEXT:    movl %edx, %ebp
+; FALLBACK27-NEXT:    shldl %cl, %edi, %ebp
+; FALLBACK27-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK27-NEXT:    shldl %cl, %esi, %edi
+; FALLBACK27-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK27-NEXT:    movl 32(%eax), %edi
+; FALLBACK27-NEXT:    movl 36(%eax), %esi
+; FALLBACK27-NEXT:    movl %esi, %ebp
+; FALLBACK27-NEXT:    shldl %cl, %edi, %ebp
+; FALLBACK27-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK27-NEXT:    shldl %cl, %edx, %edi
+; FALLBACK27-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK27-NEXT:    movl 40(%eax), %ebp
+; FALLBACK27-NEXT:    movl 44(%eax), %edx
+; FALLBACK27-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK27-NEXT:    shldl %cl, %ebp, %edx
+; FALLBACK27-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK27-NEXT:    shldl %cl, %esi, %ebp
+; FALLBACK27-NEXT:    movl 56(%eax), %edx
+; FALLBACK27-NEXT:    movl 60(%eax), %edi
+; FALLBACK27-NEXT:    shldl %cl, %edx, %edi
+; FALLBACK27-NEXT:    movl (%eax), %esi
+; FALLBACK27-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK27-NEXT:    movl 52(%eax), %esi
+; FALLBACK27-NEXT:    shldl %cl, %esi, %edx
+; FALLBACK27-NEXT:    negl %ebx
+; FALLBACK27-NEXT:    movl 176(%esp,%ebx), %ebx
+; FALLBACK27-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; FALLBACK27-NEXT:    movl %edx, 56(%eax)
+; FALLBACK27-NEXT:    movl %edi, 60(%eax)
+; FALLBACK27-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; FALLBACK27-NEXT:    shlxl %ecx, %edx, %edi
+; FALLBACK27-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK27-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; FALLBACK27-NEXT:    shldl %cl, %edx, %edi
+; FALLBACK27-NEXT:    shldl %cl, %ebx, %esi
+; FALLBACK27-NEXT:    # kill: def $cl killed $cl killed $ecx
+; FALLBACK27-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; FALLBACK27-NEXT:    shldl %cl, %edx, %ebx
+; FALLBACK27-NEXT:    movl %ebx, 48(%eax)
+; FALLBACK27-NEXT:    movl %esi, 52(%eax)
+; FALLBACK27-NEXT:    movl %ebp, 40(%eax)
+; FALLBACK27-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK27-NEXT:    movl %ecx, 44(%eax)
+; FALLBACK27-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK27-NEXT:    movl %ecx, 32(%eax)
+; FALLBACK27-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK27-NEXT:    movl %ecx, 36(%eax)
+; FALLBACK27-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK27-NEXT:    movl %ecx, 24(%eax)
+; FALLBACK27-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK27-NEXT:    movl %ecx, 28(%eax)
+; FALLBACK27-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK27-NEXT:    movl %ecx, 16(%eax)
+; FALLBACK27-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK27-NEXT:    movl %ecx, 20(%eax)
+; FALLBACK27-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK27-NEXT:    movl %ecx, 8(%eax)
+; FALLBACK27-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK27-NEXT:    movl %ecx, 12(%eax)
+; FALLBACK27-NEXT:    movl %edi, 4(%eax)
+; FALLBACK27-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK27-NEXT:    movl %ecx, (%eax)
+; FALLBACK27-NEXT:    addl $204, %esp
+; FALLBACK27-NEXT:    popl %esi
+; FALLBACK27-NEXT:    popl %edi
+; FALLBACK27-NEXT:    popl %ebx
+; FALLBACK27-NEXT:    popl %ebp
+; FALLBACK27-NEXT:    vzeroupper
+; FALLBACK27-NEXT:    retl
+;
+; FALLBACK28-LABEL: shl_64bytes:
+; FALLBACK28:       # %bb.0:
+; FALLBACK28-NEXT:    pushl %ebp
+; FALLBACK28-NEXT:    pushl %ebx
+; FALLBACK28-NEXT:    pushl %edi
+; FALLBACK28-NEXT:    pushl %esi
+; FALLBACK28-NEXT:    subl $204, %esp
+; FALLBACK28-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; FALLBACK28-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; FALLBACK28-NEXT:    vmovups (%ecx), %zmm0
+; FALLBACK28-NEXT:    movl (%eax), %eax
+; FALLBACK28-NEXT:    vxorps %xmm1, %xmm1, %xmm1
+; FALLBACK28-NEXT:    vmovups %zmm1, {{[0-9]+}}(%esp)
+; FALLBACK28-NEXT:    vmovups %zmm0, {{[0-9]+}}(%esp)
+; FALLBACK28-NEXT:    movl %eax, %edx
+; FALLBACK28-NEXT:    andl $60, %edx
+; FALLBACK28-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK28-NEXT:    leal {{[0-9]+}}(%esp), %ecx
+; FALLBACK28-NEXT:    subl %edx, %ecx
+; FALLBACK28-NEXT:    movl (%ecx), %edi
+; FALLBACK28-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK28-NEXT:    movl 4(%ecx), %edx
+; FALLBACK28-NEXT:    movl %ecx, %ebp
+; FALLBACK28-NEXT:    shll $3, %eax
+; FALLBACK28-NEXT:    andl $24, %eax
+; FALLBACK28-NEXT:    movl %edx, %esi
+; FALLBACK28-NEXT:    movl %eax, %ecx
+; FALLBACK28-NEXT:    shll %cl, %esi
+; FALLBACK28-NEXT:    shrl %edi
+; FALLBACK28-NEXT:    movb %al, %ch
+; FALLBACK28-NEXT:    notb %ch
+; FALLBACK28-NEXT:    movb %ch, %cl
+; FALLBACK28-NEXT:    shrl %cl, %edi
+; FALLBACK28-NEXT:    orl %esi, %edi
+; FALLBACK28-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK28-NEXT:    movl 12(%ebp), %ebx
+; FALLBACK28-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK28-NEXT:    movb %al, %cl
+; FALLBACK28-NEXT:    shll %cl, %ebx
+; FALLBACK28-NEXT:    movl 8(%ebp), %esi
+; FALLBACK28-NEXT:    movl %ebp, %edi
+; FALLBACK28-NEXT:    movl %esi, %ebp
+; FALLBACK28-NEXT:    shrl %ebp
+; FALLBACK28-NEXT:    movb %ch, %cl
+; FALLBACK28-NEXT:    shrl %cl, %ebp
+; FALLBACK28-NEXT:    orl %ebx, %ebp
+; FALLBACK28-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK28-NEXT:    movb %al, %cl
+; FALLBACK28-NEXT:    shll %cl, %esi
+; FALLBACK28-NEXT:    shrl %edx
+; FALLBACK28-NEXT:    movb %ch, %cl
+; FALLBACK28-NEXT:    shrl %cl, %edx
+; FALLBACK28-NEXT:    orl %esi, %edx
+; FALLBACK28-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK28-NEXT:    movl %edi, %ebp
+; FALLBACK28-NEXT:    movl 20(%edi), %ebx
+; FALLBACK28-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK28-NEXT:    movb %al, %cl
+; FALLBACK28-NEXT:    shll %cl, %ebx
+; FALLBACK28-NEXT:    movl 16(%edi), %esi
+; FALLBACK28-NEXT:    movl %esi, %edx
+; FALLBACK28-NEXT:    shrl %edx
+; FALLBACK28-NEXT:    movb %ch, %cl
+; FALLBACK28-NEXT:    shrl %cl, %edx
+; FALLBACK28-NEXT:    orl %ebx, %edx
+; FALLBACK28-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK28-NEXT:    movb %al, %cl
+; FALLBACK28-NEXT:    shll %cl, %esi
+; FALLBACK28-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; FALLBACK28-NEXT:    shrl %edi
+; FALLBACK28-NEXT:    movb %ch, %cl
+; FALLBACK28-NEXT:    shrl %cl, %edi
+; FALLBACK28-NEXT:    orl %esi, %edi
+; FALLBACK28-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK28-NEXT:    movl %ebp, %edx
+; FALLBACK28-NEXT:    movl 28(%ebp), %ebx
+; FALLBACK28-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK28-NEXT:    movb %al, %cl
+; FALLBACK28-NEXT:    shll %cl, %ebx
+; FALLBACK28-NEXT:    movl 24(%ebp), %esi
+; FALLBACK28-NEXT:    movl %esi, %edi
+; FALLBACK28-NEXT:    shrl %edi
+; FALLBACK28-NEXT:    movb %ch, %cl
+; FALLBACK28-NEXT:    shrl %cl, %edi
+; FALLBACK28-NEXT:    orl %ebx, %edi
+; FALLBACK28-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK28-NEXT:    movb %al, %cl
+; FALLBACK28-NEXT:    shll %cl, %esi
+; FALLBACK28-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
+; FALLBACK28-NEXT:    shrl %ebp
+; FALLBACK28-NEXT:    movb %ch, %cl
+; FALLBACK28-NEXT:    shrl %cl, %ebp
+; FALLBACK28-NEXT:    orl %esi, %ebp
+; FALLBACK28-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK28-NEXT:    movl 36(%edx), %ebx
+; FALLBACK28-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK28-NEXT:    movb %al, %cl
+; FALLBACK28-NEXT:    shll %cl, %ebx
+; FALLBACK28-NEXT:    movl 32(%edx), %esi
+; FALLBACK28-NEXT:    movl %edx, %ebp
+; FALLBACK28-NEXT:    movl %esi, %edi
+; FALLBACK28-NEXT:    shrl %edi
+; FALLBACK28-NEXT:    movb %ch, %cl
+; FALLBACK28-NEXT:    shrl %cl, %edi
+; FALLBACK28-NEXT:    orl %ebx, %edi
+; FALLBACK28-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK28-NEXT:    movb %al, %cl
+; FALLBACK28-NEXT:    shll %cl, %esi
+; FALLBACK28-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; FALLBACK28-NEXT:    shrl %edx
+; FALLBACK28-NEXT:    movb %ch, %cl
+; FALLBACK28-NEXT:    shrl %cl, %edx
+; FALLBACK28-NEXT:    orl %esi, %edx
+; FALLBACK28-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK28-NEXT:    movl 44(%ebp), %ebx
+; FALLBACK28-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK28-NEXT:    movb %al, %cl
+; FALLBACK28-NEXT:    shll %cl, %ebx
+; FALLBACK28-NEXT:    movl 40(%ebp), %esi
+; FALLBACK28-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK28-NEXT:    movl %esi, %edx
+; FALLBACK28-NEXT:    shrl %edx
+; FALLBACK28-NEXT:    movb %ch, %cl
+; FALLBACK28-NEXT:    shrl %cl, %edx
+; FALLBACK28-NEXT:    orl %ebx, %edx
+; FALLBACK28-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK28-NEXT:    movb %al, %cl
+; FALLBACK28-NEXT:    shll %cl, %esi
+; FALLBACK28-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; FALLBACK28-NEXT:    shrl %edx
+; FALLBACK28-NEXT:    movb %ch, %cl
+; FALLBACK28-NEXT:    shrl %cl, %edx
+; FALLBACK28-NEXT:    orl %esi, %edx
+; FALLBACK28-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK28-NEXT:    movl 52(%ebp), %esi
+; FALLBACK28-NEXT:    movl %esi, %edi
+; FALLBACK28-NEXT:    movb %al, %cl
+; FALLBACK28-NEXT:    shll %cl, %edi
+; FALLBACK28-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; FALLBACK28-NEXT:    negl %edx
+; FALLBACK28-NEXT:    movl 176(%esp,%edx), %ebx
+; FALLBACK28-NEXT:    movl %ebx, %ebp
+; FALLBACK28-NEXT:    shrl %ebp
+; FALLBACK28-NEXT:    movb %ch, %cl
+; FALLBACK28-NEXT:    shrl %cl, %ebp
+; FALLBACK28-NEXT:    orl %edi, %ebp
+; FALLBACK28-NEXT:    movb %al, %cl
+; FALLBACK28-NEXT:    shll %cl, %ebx
+; FALLBACK28-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; FALLBACK28-NEXT:    shrl %edx
+; FALLBACK28-NEXT:    movb %ch, %cl
+; FALLBACK28-NEXT:    shrl %cl, %edx
+; FALLBACK28-NEXT:    orl %ebx, %edx
+; FALLBACK28-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK28-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; FALLBACK28-NEXT:    movl 60(%edi), %edx
+; FALLBACK28-NEXT:    movb %al, %cl
+; FALLBACK28-NEXT:    shll %cl, %edx
+; FALLBACK28-NEXT:    movl 56(%edi), %ebx
+; FALLBACK28-NEXT:    movl %ebx, %edi
+; FALLBACK28-NEXT:    shrl %edi
+; FALLBACK28-NEXT:    movb %ch, %cl
+; FALLBACK28-NEXT:    shrl %cl, %edi
+; FALLBACK28-NEXT:    orl %edx, %edi
+; FALLBACK28-NEXT:    movb %al, %cl
+; FALLBACK28-NEXT:    shll %cl, %ebx
+; FALLBACK28-NEXT:    shrl %esi
+; FALLBACK28-NEXT:    movb %ch, %cl
+; FALLBACK28-NEXT:    shrl %cl, %esi
+; FALLBACK28-NEXT:    orl %ebx, %esi
+; FALLBACK28-NEXT:    movl %eax, %ecx
+; FALLBACK28-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; FALLBACK28-NEXT:    shll %cl, %edx
+; FALLBACK28-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; FALLBACK28-NEXT:    movl %edx, (%eax)
+; FALLBACK28-NEXT:    movl %esi, 56(%eax)
+; FALLBACK28-NEXT:    movl %edi, 60(%eax)
+; FALLBACK28-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK28-NEXT:    movl %ecx, 48(%eax)
+; FALLBACK28-NEXT:    movl %ebp, 52(%eax)
+; FALLBACK28-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK28-NEXT:    movl %ecx, 40(%eax)
+; FALLBACK28-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK28-NEXT:    movl %ecx, 44(%eax)
+; FALLBACK28-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK28-NEXT:    movl %ecx, 32(%eax)
+; FALLBACK28-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK28-NEXT:    movl %ecx, 36(%eax)
+; FALLBACK28-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK28-NEXT:    movl %ecx, 24(%eax)
+; FALLBACK28-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK28-NEXT:    movl %ecx, 28(%eax)
+; FALLBACK28-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK28-NEXT:    movl %ecx, 16(%eax)
+; FALLBACK28-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK28-NEXT:    movl %ecx, 20(%eax)
+; FALLBACK28-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK28-NEXT:    movl %ecx, 8(%eax)
+; FALLBACK28-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK28-NEXT:    movl %ecx, 12(%eax)
+; FALLBACK28-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK28-NEXT:    movl %ecx, 4(%eax)
+; FALLBACK28-NEXT:    addl $204, %esp
+; FALLBACK28-NEXT:    popl %esi
+; FALLBACK28-NEXT:    popl %edi
+; FALLBACK28-NEXT:    popl %ebx
+; FALLBACK28-NEXT:    popl %ebp
+; FALLBACK28-NEXT:    vzeroupper
+; FALLBACK28-NEXT:    retl
+;
+; FALLBACK29-LABEL: shl_64bytes:
+; FALLBACK29:       # %bb.0:
+; FALLBACK29-NEXT:    pushl %ebp
+; FALLBACK29-NEXT:    pushl %ebx
+; FALLBACK29-NEXT:    pushl %edi
+; FALLBACK29-NEXT:    pushl %esi
+; FALLBACK29-NEXT:    subl $188, %esp
+; FALLBACK29-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; FALLBACK29-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; FALLBACK29-NEXT:    vmovups (%ecx), %zmm0
+; FALLBACK29-NEXT:    movl (%eax), %ecx
+; FALLBACK29-NEXT:    vxorps %xmm1, %xmm1, %xmm1
+; FALLBACK29-NEXT:    vmovups %zmm1, {{[0-9]+}}(%esp)
+; FALLBACK29-NEXT:    vmovups %zmm0, {{[0-9]+}}(%esp)
+; FALLBACK29-NEXT:    movl %ecx, %ebp
+; FALLBACK29-NEXT:    andl $60, %ebp
+; FALLBACK29-NEXT:    leal {{[0-9]+}}(%esp), %eax
+; FALLBACK29-NEXT:    subl %ebp, %eax
+; FALLBACK29-NEXT:    movl 8(%eax), %esi
+; FALLBACK29-NEXT:    movl 12(%eax), %edx
+; FALLBACK29-NEXT:    shll $3, %ecx
+; FALLBACK29-NEXT:    andl $24, %ecx
+; FALLBACK29-NEXT:    movl %edx, %edi
+; FALLBACK29-NEXT:    shldl %cl, %esi, %edi
+; FALLBACK29-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK29-NEXT:    movl 4(%eax), %edi
+; FALLBACK29-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK29-NEXT:    shldl %cl, %edi, %esi
+; FALLBACK29-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK29-NEXT:    movl 16(%eax), %edi
+; FALLBACK29-NEXT:    movl 20(%eax), %esi
+; FALLBACK29-NEXT:    movl %esi, %ebx
+; FALLBACK29-NEXT:    shldl %cl, %edi, %ebx
+; FALLBACK29-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK29-NEXT:    shldl %cl, %edx, %edi
+; FALLBACK29-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK29-NEXT:    movl 24(%eax), %edi
+; FALLBACK29-NEXT:    movl 28(%eax), %edx
+; FALLBACK29-NEXT:    movl %edx, %ebx
+; FALLBACK29-NEXT:    shldl %cl, %edi, %ebx
+; FALLBACK29-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK29-NEXT:    shldl %cl, %esi, %edi
+; FALLBACK29-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK29-NEXT:    movl 32(%eax), %edi
+; FALLBACK29-NEXT:    movl 36(%eax), %esi
+; FALLBACK29-NEXT:    movl %esi, %ebx
+; FALLBACK29-NEXT:    shldl %cl, %edi, %ebx
+; FALLBACK29-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK29-NEXT:    shldl %cl, %edx, %edi
+; FALLBACK29-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK29-NEXT:    movl 40(%eax), %edx
+; FALLBACK29-NEXT:    movl 44(%eax), %edi
+; FALLBACK29-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK29-NEXT:    shldl %cl, %edx, %edi
+; FALLBACK29-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK29-NEXT:    shldl %cl, %esi, %edx
+; FALLBACK29-NEXT:    movl %edx, (%esp) # 4-byte Spill
+; FALLBACK29-NEXT:    movl 56(%eax), %edx
+; FALLBACK29-NEXT:    movl 60(%eax), %edi
+; FALLBACK29-NEXT:    shldl %cl, %edx, %edi
+; FALLBACK29-NEXT:    movl (%eax), %ebx
+; FALLBACK29-NEXT:    movl 52(%eax), %esi
+; FALLBACK29-NEXT:    shldl %cl, %esi, %edx
+; FALLBACK29-NEXT:    negl %ebp
+; FALLBACK29-NEXT:    movl 160(%esp,%ebp), %eax
+; FALLBACK29-NEXT:    movl {{[0-9]+}}(%esp), %ebp
+; FALLBACK29-NEXT:    movl %edx, 56(%ebp)
+; FALLBACK29-NEXT:    movl %edi, 60(%ebp)
+; FALLBACK29-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; FALLBACK29-NEXT:    shldl %cl, %ebx, %edx
+; FALLBACK29-NEXT:    shll %cl, %ebx
+; FALLBACK29-NEXT:    shldl %cl, %eax, %esi
+; FALLBACK29-NEXT:    # kill: def $cl killed $cl killed $ecx
+; FALLBACK29-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; FALLBACK29-NEXT:    shldl %cl, %edi, %eax
+; FALLBACK29-NEXT:    movl %eax, 48(%ebp)
+; FALLBACK29-NEXT:    movl %esi, 52(%ebp)
+; FALLBACK29-NEXT:    movl (%esp), %eax # 4-byte Reload
+; FALLBACK29-NEXT:    movl %eax, 40(%ebp)
+; FALLBACK29-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK29-NEXT:    movl %eax, 44(%ebp)
+; FALLBACK29-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK29-NEXT:    movl %eax, 32(%ebp)
+; FALLBACK29-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK29-NEXT:    movl %eax, 36(%ebp)
+; FALLBACK29-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK29-NEXT:    movl %eax, 24(%ebp)
+; FALLBACK29-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK29-NEXT:    movl %eax, 28(%ebp)
+; FALLBACK29-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK29-NEXT:    movl %eax, 16(%ebp)
+; FALLBACK29-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK29-NEXT:    movl %eax, 20(%ebp)
+; FALLBACK29-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK29-NEXT:    movl %eax, 8(%ebp)
+; FALLBACK29-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK29-NEXT:    movl %eax, 12(%ebp)
+; FALLBACK29-NEXT:    movl %ebx, (%ebp)
+; FALLBACK29-NEXT:    movl %edx, 4(%ebp)
+; FALLBACK29-NEXT:    addl $188, %esp
+; FALLBACK29-NEXT:    popl %esi
+; FALLBACK29-NEXT:    popl %edi
+; FALLBACK29-NEXT:    popl %ebx
+; FALLBACK29-NEXT:    popl %ebp
+; FALLBACK29-NEXT:    vzeroupper
+; FALLBACK29-NEXT:    retl
+;
+; FALLBACK30-LABEL: shl_64bytes:
+; FALLBACK30:       # %bb.0:
+; FALLBACK30-NEXT:    pushl %ebp
+; FALLBACK30-NEXT:    pushl %ebx
+; FALLBACK30-NEXT:    pushl %edi
+; FALLBACK30-NEXT:    pushl %esi
+; FALLBACK30-NEXT:    subl $204, %esp
+; FALLBACK30-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; FALLBACK30-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; FALLBACK30-NEXT:    vmovups (%ecx), %zmm0
+; FALLBACK30-NEXT:    movl (%eax), %eax
+; FALLBACK30-NEXT:    vxorps %xmm1, %xmm1, %xmm1
+; FALLBACK30-NEXT:    vmovups %zmm1, {{[0-9]+}}(%esp)
+; FALLBACK30-NEXT:    vmovups %zmm0, {{[0-9]+}}(%esp)
+; FALLBACK30-NEXT:    leal (,%eax,8), %edx
+; FALLBACK30-NEXT:    andl $24, %edx
+; FALLBACK30-NEXT:    andl $60, %eax
+; FALLBACK30-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK30-NEXT:    leal {{[0-9]+}}(%esp), %edi
+; FALLBACK30-NEXT:    subl %eax, %edi
+; FALLBACK30-NEXT:    movl (%edi), %ecx
+; FALLBACK30-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK30-NEXT:    movl 4(%edi), %eax
+; FALLBACK30-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK30-NEXT:    movl %edx, %ebx
+; FALLBACK30-NEXT:    notb %bl
+; FALLBACK30-NEXT:    shrl %ecx
+; FALLBACK30-NEXT:    shrxl %ebx, %ecx, %esi
+; FALLBACK30-NEXT:    shlxl %edx, %eax, %ecx
+; FALLBACK30-NEXT:    orl %ecx, %esi
+; FALLBACK30-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK30-NEXT:    movl 8(%edi), %esi
+; FALLBACK30-NEXT:    movl %esi, %ecx
+; FALLBACK30-NEXT:    shrl %ecx
+; FALLBACK30-NEXT:    shrxl %ebx, %ecx, %eax
+; FALLBACK30-NEXT:    movl 12(%edi), %ecx
+; FALLBACK30-NEXT:    shlxl %edx, %ecx, %ebp
+; FALLBACK30-NEXT:    orl %ebp, %eax
+; FALLBACK30-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK30-NEXT:    shlxl %edx, %esi, %esi
+; FALLBACK30-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK30-NEXT:    shrl %eax
+; FALLBACK30-NEXT:    shrxl %ebx, %eax, %eax
+; FALLBACK30-NEXT:    orl %esi, %eax
+; FALLBACK30-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK30-NEXT:    movl 16(%edi), %eax
+; FALLBACK30-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK30-NEXT:    shrl %eax
+; FALLBACK30-NEXT:    shrxl %ebx, %eax, %eax
+; FALLBACK30-NEXT:    movl 20(%edi), %esi
+; FALLBACK30-NEXT:    shlxl %edx, %esi, %ebp
+; FALLBACK30-NEXT:    orl %ebp, %eax
+; FALLBACK30-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK30-NEXT:    shlxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; FALLBACK30-NEXT:    shrl %ecx
+; FALLBACK30-NEXT:    shrxl %ebx, %ecx, %ecx
+; FALLBACK30-NEXT:    orl %eax, %ecx
+; FALLBACK30-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK30-NEXT:    movl 24(%edi), %ecx
+; FALLBACK30-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK30-NEXT:    shrl %ecx
+; FALLBACK30-NEXT:    shrxl %ebx, %ecx, %eax
+; FALLBACK30-NEXT:    movl 28(%edi), %ecx
+; FALLBACK30-NEXT:    shlxl %edx, %ecx, %ebp
+; FALLBACK30-NEXT:    orl %ebp, %eax
+; FALLBACK30-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK30-NEXT:    shlxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; FALLBACK30-NEXT:    shrl %esi
+; FALLBACK30-NEXT:    shrxl %ebx, %esi, %esi
+; FALLBACK30-NEXT:    orl %eax, %esi
+; FALLBACK30-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK30-NEXT:    movl 32(%edi), %eax
+; FALLBACK30-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK30-NEXT:    shrl %eax
+; FALLBACK30-NEXT:    shrxl %ebx, %eax, %eax
+; FALLBACK30-NEXT:    movl 36(%edi), %esi
+; FALLBACK30-NEXT:    shlxl %edx, %esi, %ebp
+; FALLBACK30-NEXT:    orl %ebp, %eax
+; FALLBACK30-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK30-NEXT:    shlxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; FALLBACK30-NEXT:    shrl %ecx
+; FALLBACK30-NEXT:    shrxl %ebx, %ecx, %ecx
+; FALLBACK30-NEXT:    orl %eax, %ecx
+; FALLBACK30-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK30-NEXT:    movl 40(%edi), %ecx
+; FALLBACK30-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK30-NEXT:    shrl %ecx
+; FALLBACK30-NEXT:    shrxl %ebx, %ecx, %eax
+; FALLBACK30-NEXT:    movl 44(%edi), %ecx
+; FALLBACK30-NEXT:    shlxl %edx, %ecx, %ebp
+; FALLBACK30-NEXT:    orl %ebp, %eax
+; FALLBACK30-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK30-NEXT:    shlxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; FALLBACK30-NEXT:    shrl %esi
+; FALLBACK30-NEXT:    shrxl %ebx, %esi, %esi
+; FALLBACK30-NEXT:    orl %eax, %esi
+; FALLBACK30-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK30-NEXT:    movl 48(%edi), %esi
+; FALLBACK30-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK30-NEXT:    shrl %esi
+; FALLBACK30-NEXT:    shrxl %ebx, %esi, %eax
+; FALLBACK30-NEXT:    movl 52(%edi), %esi
+; FALLBACK30-NEXT:    shlxl %edx, %esi, %ebp
+; FALLBACK30-NEXT:    orl %ebp, %eax
+; FALLBACK30-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK30-NEXT:    shlxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; FALLBACK30-NEXT:    shrl %ecx
+; FALLBACK30-NEXT:    shrxl %ebx, %ecx, %ebp
+; FALLBACK30-NEXT:    orl %eax, %ebp
+; FALLBACK30-NEXT:    shlxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; FALLBACK30-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK30-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK30-NEXT:    negl %eax
+; FALLBACK30-NEXT:    shlxl %edx, 188(%esp,%eax), %ecx
+; FALLBACK30-NEXT:    movl 56(%edi), %eax
+; FALLBACK30-NEXT:    shlxl %edx, %eax, %edx
+; FALLBACK30-NEXT:    shrl %esi
+; FALLBACK30-NEXT:    shrxl %ebx, %esi, %esi
+; FALLBACK30-NEXT:    orl %edx, %esi
+; FALLBACK30-NEXT:    shrl %eax
+; FALLBACK30-NEXT:    shrxl %ebx, %eax, %eax
+; FALLBACK30-NEXT:    orl %eax, %ecx
+; FALLBACK30-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; FALLBACK30-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; FALLBACK30-NEXT:    movl %edx, (%eax)
+; FALLBACK30-NEXT:    movl %esi, 56(%eax)
+; FALLBACK30-NEXT:    movl %ecx, 60(%eax)
+; FALLBACK30-NEXT:    movl %ebp, 48(%eax)
+; FALLBACK30-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK30-NEXT:    movl %ecx, 52(%eax)
+; FALLBACK30-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK30-NEXT:    movl %ecx, 40(%eax)
+; FALLBACK30-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK30-NEXT:    movl %ecx, 44(%eax)
+; FALLBACK30-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK30-NEXT:    movl %ecx, 32(%eax)
+; FALLBACK30-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK30-NEXT:    movl %ecx, 36(%eax)
+; FALLBACK30-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK30-NEXT:    movl %ecx, 24(%eax)
+; FALLBACK30-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK30-NEXT:    movl %ecx, 28(%eax)
+; FALLBACK30-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK30-NEXT:    movl %ecx, 16(%eax)
+; FALLBACK30-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK30-NEXT:    movl %ecx, 20(%eax)
+; FALLBACK30-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK30-NEXT:    movl %ecx, 8(%eax)
+; FALLBACK30-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK30-NEXT:    movl %ecx, 12(%eax)
+; FALLBACK30-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK30-NEXT:    movl %ecx, 4(%eax)
+; FALLBACK30-NEXT:    addl $204, %esp
+; FALLBACK30-NEXT:    popl %esi
+; FALLBACK30-NEXT:    popl %edi
+; FALLBACK30-NEXT:    popl %ebx
+; FALLBACK30-NEXT:    popl %ebp
+; FALLBACK30-NEXT:    vzeroupper
+; FALLBACK30-NEXT:    retl
+;
+; FALLBACK31-LABEL: shl_64bytes:
+; FALLBACK31:       # %bb.0:
+; FALLBACK31-NEXT:    pushl %ebp
+; FALLBACK31-NEXT:    pushl %ebx
+; FALLBACK31-NEXT:    pushl %edi
+; FALLBACK31-NEXT:    pushl %esi
+; FALLBACK31-NEXT:    subl $204, %esp
+; FALLBACK31-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; FALLBACK31-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; FALLBACK31-NEXT:    vmovups (%ecx), %zmm0
+; FALLBACK31-NEXT:    movl (%eax), %ebx
+; FALLBACK31-NEXT:    vxorps %xmm1, %xmm1, %xmm1
+; FALLBACK31-NEXT:    vmovups %zmm1, {{[0-9]+}}(%esp)
+; FALLBACK31-NEXT:    vmovups %zmm0, {{[0-9]+}}(%esp)
+; FALLBACK31-NEXT:    leal (,%ebx,8), %ecx
+; FALLBACK31-NEXT:    andl $24, %ecx
+; FALLBACK31-NEXT:    andl $60, %ebx
+; FALLBACK31-NEXT:    leal {{[0-9]+}}(%esp), %eax
+; FALLBACK31-NEXT:    subl %ebx, %eax
+; FALLBACK31-NEXT:    movl 4(%eax), %esi
+; FALLBACK31-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK31-NEXT:    movl 8(%eax), %edi
+; FALLBACK31-NEXT:    movl 12(%eax), %edx
+; FALLBACK31-NEXT:    movl %edx, %ebp
+; FALLBACK31-NEXT:    shldl %cl, %edi, %ebp
+; FALLBACK31-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK31-NEXT:    shldl %cl, %esi, %edi
+; FALLBACK31-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK31-NEXT:    movl 16(%eax), %edi
+; FALLBACK31-NEXT:    movl 20(%eax), %esi
+; FALLBACK31-NEXT:    movl %esi, %ebp
+; FALLBACK31-NEXT:    shldl %cl, %edi, %ebp
+; FALLBACK31-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK31-NEXT:    shldl %cl, %edx, %edi
+; FALLBACK31-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK31-NEXT:    movl 24(%eax), %edi
+; FALLBACK31-NEXT:    movl 28(%eax), %edx
+; FALLBACK31-NEXT:    movl %edx, %ebp
+; FALLBACK31-NEXT:    shldl %cl, %edi, %ebp
+; FALLBACK31-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK31-NEXT:    shldl %cl, %esi, %edi
+; FALLBACK31-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK31-NEXT:    movl 32(%eax), %edi
+; FALLBACK31-NEXT:    movl 36(%eax), %esi
+; FALLBACK31-NEXT:    movl %esi, %ebp
+; FALLBACK31-NEXT:    shldl %cl, %edi, %ebp
+; FALLBACK31-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK31-NEXT:    shldl %cl, %edx, %edi
+; FALLBACK31-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK31-NEXT:    movl 40(%eax), %ebp
+; FALLBACK31-NEXT:    movl 44(%eax), %edx
+; FALLBACK31-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK31-NEXT:    shldl %cl, %ebp, %edx
+; FALLBACK31-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK31-NEXT:    shldl %cl, %esi, %ebp
+; FALLBACK31-NEXT:    movl 56(%eax), %edx
+; FALLBACK31-NEXT:    movl 60(%eax), %edi
+; FALLBACK31-NEXT:    shldl %cl, %edx, %edi
+; FALLBACK31-NEXT:    movl (%eax), %esi
+; FALLBACK31-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK31-NEXT:    movl 52(%eax), %esi
+; FALLBACK31-NEXT:    shldl %cl, %esi, %edx
+; FALLBACK31-NEXT:    negl %ebx
+; FALLBACK31-NEXT:    movl 176(%esp,%ebx), %ebx
+; FALLBACK31-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; FALLBACK31-NEXT:    movl %edx, 56(%eax)
+; FALLBACK31-NEXT:    movl %edi, 60(%eax)
+; FALLBACK31-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; FALLBACK31-NEXT:    shlxl %ecx, %edx, %edi
+; FALLBACK31-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK31-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; FALLBACK31-NEXT:    shldl %cl, %edx, %edi
+; FALLBACK31-NEXT:    shldl %cl, %ebx, %esi
+; FALLBACK31-NEXT:    # kill: def $cl killed $cl killed $ecx
+; FALLBACK31-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; FALLBACK31-NEXT:    shldl %cl, %edx, %ebx
+; FALLBACK31-NEXT:    movl %ebx, 48(%eax)
+; FALLBACK31-NEXT:    movl %esi, 52(%eax)
+; FALLBACK31-NEXT:    movl %ebp, 40(%eax)
+; FALLBACK31-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK31-NEXT:    movl %ecx, 44(%eax)
+; FALLBACK31-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK31-NEXT:    movl %ecx, 32(%eax)
+; FALLBACK31-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK31-NEXT:    movl %ecx, 36(%eax)
+; FALLBACK31-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK31-NEXT:    movl %ecx, 24(%eax)
+; FALLBACK31-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK31-NEXT:    movl %ecx, 28(%eax)
+; FALLBACK31-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK31-NEXT:    movl %ecx, 16(%eax)
+; FALLBACK31-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK31-NEXT:    movl %ecx, 20(%eax)
+; FALLBACK31-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK31-NEXT:    movl %ecx, 8(%eax)
+; FALLBACK31-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK31-NEXT:    movl %ecx, 12(%eax)
+; FALLBACK31-NEXT:    movl %edi, 4(%eax)
+; FALLBACK31-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK31-NEXT:    movl %ecx, (%eax)
+; FALLBACK31-NEXT:    addl $204, %esp
+; FALLBACK31-NEXT:    popl %esi
+; FALLBACK31-NEXT:    popl %edi
+; FALLBACK31-NEXT:    popl %ebx
+; FALLBACK31-NEXT:    popl %ebp
+; FALLBACK31-NEXT:    vzeroupper
+; FALLBACK31-NEXT:    retl
+  %src = load i512, ptr %src.ptr, align 1
+  %byteOff = load i512, ptr %byteOff.ptr, align 1
+  %bitOff = shl i512 %byteOff, 3
+  %res = shl i512 %src, %bitOff
+  store i512 %res, ptr %dst, align 1
+  ret void
+}
+
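
The shl_64bytes checks above all exercise the same shift-through-stack lowering: the 64-byte source is spilled next to a zeroed region, the word-aligned part of the shift amount becomes a negated offset into that slot (the `andl $60` / `subl` / `negl` sequences), and only the sub-word remainder (`andl $24`) is paid for with real shift instructions, the `shld`/`shlxl` pairs. For reference, a minimal C sketch of that technique under the 32-bit word size of the i386 output; the name `shl_64bytes_ref` and its exact layout are illustrative only, not taken from the test or the compiler:

    #include <stdint.h>
    #include <string.h>

    static void shl_64bytes_ref(const uint8_t *src, unsigned byteOff,
                                uint8_t *dst) {
      /* Zero-filled slot with the source in the upper half; a left shift
         by k bytes is then just a 64-byte load at (slot + 64 - k). */
      uint8_t slot[128] = {0};
      memcpy(slot + 64, src, 64);

      unsigned bits = (byteOff * 8) & 511; /* i512 shift amount, mod width */
      unsigned k    = (bits / 8) & ~3u;    /* word-aligned byte offset: andl $60 */
      unsigned rem  = bits & 31;           /* in-word remainder:        andl $24 */

      uint32_t w[17]; /* 16 result words plus one low word for the carry-in */
      memcpy(w, slot + 64 - k - 4, sizeof(w));
      for (int i = 0; i < 16; i++) {
        /* shld: each result word takes rem bits from the word below it. */
        uint32_t out = rem ? (w[i + 1] << rem) | (w[i] >> (32 - rem))
                           : w[i + 1];
        memcpy(dst + 4 * i, &out, 4);
      }
    }

When rem is zero the loop degenerates to a plain copy, which is exactly the shape of the _qwordOff variant below: the offset there is already word-aligned (`shll $3` followed by `andl $56`), and the result is moved out with plain `movups`/`vmovups` copies, with no `shld` fixups at all.
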
+define void @shl_64bytes_qwordOff(ptr %src.ptr, ptr %qwordOff.ptr, ptr %dst) nounwind {
+; X64-SSE2-LABEL: shl_64bytes_qwordOff:
 ; X64-SSE2:       # %bb.0:
 ; X64-SSE2-NEXT:    pushq %rbx
 ; X64-SSE2-NEXT:    movq (%rdi), %rax
@@ -2012,6 +19811,11 @@ define void @shl_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ; X64-SSE2-NEXT:    movq 48(%rdi), %rbx
 ; X64-SSE2-NEXT:    movq 56(%rdi), %rdi
 ; X64-SSE2-NEXT:    movl (%rsi), %esi
+; X64-SSE2-NEXT:    xorps %xmm0, %xmm0
+; X64-SSE2-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
+; X64-SSE2-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
+; X64-SSE2-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
+; X64-SSE2-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
 ; X64-SSE2-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
 ; X64-SSE2-NEXT:    movq %rbx, -{{[0-9]+}}(%rsp)
 ; X64-SSE2-NEXT:    movq %r11, -{{[0-9]+}}(%rsp)
@@ -2020,15 +19824,8 @@ define void @shl_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ; X64-SSE2-NEXT:    movq %r8, -{{[0-9]+}}(%rsp)
 ; X64-SSE2-NEXT:    movq %rcx, -{{[0-9]+}}(%rsp)
 ; X64-SSE2-NEXT:    movq %rax, -{{[0-9]+}}(%rsp)
-; X64-SSE2-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
-; X64-SSE2-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
-; X64-SSE2-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
-; X64-SSE2-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
-; X64-SSE2-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
-; X64-SSE2-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
-; X64-SSE2-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
-; X64-SSE2-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
-; X64-SSE2-NEXT:    andl $63, %esi
+; X64-SSE2-NEXT:    shll $3, %esi
+; X64-SSE2-NEXT:    andl $56, %esi
 ; X64-SSE2-NEXT:    negl %esi
 ; X64-SSE2-NEXT:    movslq %esi, %rax
 ; X64-SSE2-NEXT:    movq -64(%rsp,%rax), %rcx
@@ -2050,23 +19847,25 @@ define void @shl_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ; X64-SSE2-NEXT:    popq %rbx
 ; X64-SSE2-NEXT:    retq
 ;
-; X64-SSE42-LABEL: shl_64bytes:
+; X64-SSE42-LABEL: shl_64bytes_qwordOff:
 ; X64-SSE42:       # %bb.0:
+; X64-SSE42-NEXT:    pushq %rax
 ; X64-SSE42-NEXT:    movups (%rdi), %xmm0
 ; X64-SSE42-NEXT:    movups 16(%rdi), %xmm1
 ; X64-SSE42-NEXT:    movups 32(%rdi), %xmm2
 ; X64-SSE42-NEXT:    movups 48(%rdi), %xmm3
 ; X64-SSE42-NEXT:    movl (%rsi), %eax
 ; X64-SSE42-NEXT:    xorps %xmm4, %xmm4
-; X64-SSE42-NEXT:    movups %xmm4, -{{[0-9]+}}(%rsp)
-; X64-SSE42-NEXT:    movups %xmm4, -{{[0-9]+}}(%rsp)
-; X64-SSE42-NEXT:    movups %xmm4, -{{[0-9]+}}(%rsp)
-; X64-SSE42-NEXT:    movups %xmm4, -{{[0-9]+}}(%rsp)
-; X64-SSE42-NEXT:    movups %xmm3, -{{[0-9]+}}(%rsp)
-; X64-SSE42-NEXT:    movups %xmm2, -{{[0-9]+}}(%rsp)
-; X64-SSE42-NEXT:    movups %xmm1, -{{[0-9]+}}(%rsp)
-; X64-SSE42-NEXT:    movups %xmm0, -{{[0-9]+}}(%rsp)
-; X64-SSE42-NEXT:    andl $63, %eax
+; X64-SSE42-NEXT:    movaps %xmm4, -{{[0-9]+}}(%rsp)
+; X64-SSE42-NEXT:    movaps %xmm4, -{{[0-9]+}}(%rsp)
+; X64-SSE42-NEXT:    movaps %xmm4, -{{[0-9]+}}(%rsp)
+; X64-SSE42-NEXT:    movaps %xmm4, -{{[0-9]+}}(%rsp)
+; X64-SSE42-NEXT:    movaps %xmm3, -{{[0-9]+}}(%rsp)
+; X64-SSE42-NEXT:    movaps %xmm2, -{{[0-9]+}}(%rsp)
+; X64-SSE42-NEXT:    movaps %xmm1, -{{[0-9]+}}(%rsp)
+; X64-SSE42-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
+; X64-SSE42-NEXT:    shll $3, %eax
+; X64-SSE42-NEXT:    andl $56, %eax
 ; X64-SSE42-NEXT:    negl %eax
 ; X64-SSE42-NEXT:    cltq
 ; X64-SSE42-NEXT:    movups -64(%rsp,%rax), %xmm0
@@ -2077,10 +19876,12 @@ define void @shl_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ; X64-SSE42-NEXT:    movups %xmm1, 16(%rdx)
 ; X64-SSE42-NEXT:    movups %xmm2, 32(%rdx)
 ; X64-SSE42-NEXT:    movups %xmm0, (%rdx)
+; X64-SSE42-NEXT:    popq %rax
 ; X64-SSE42-NEXT:    retq
 ;
-; X64-AVX1-LABEL: shl_64bytes:
+; X64-AVX1-LABEL: shl_64bytes_qwordOff:
 ; X64-AVX1:       # %bb.0:
+; X64-AVX1-NEXT:    pushq %rax
 ; X64-AVX1-NEXT:    vmovups (%rdi), %ymm0
 ; X64-AVX1-NEXT:    vmovups 32(%rdi), %ymm1
 ; X64-AVX1-NEXT:    movl (%rsi), %eax
@@ -2089,7 +19890,8 @@ define void @shl_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ; X64-AVX1-NEXT:    vmovups %ymm2, -{{[0-9]+}}(%rsp)
 ; X64-AVX1-NEXT:    vmovups %ymm1, -{{[0-9]+}}(%rsp)
 ; X64-AVX1-NEXT:    vmovups %ymm0, -{{[0-9]+}}(%rsp)
-; X64-AVX1-NEXT:    andl $63, %eax
+; X64-AVX1-NEXT:    shll $3, %eax
+; X64-AVX1-NEXT:    andl $56, %eax
 ; X64-AVX1-NEXT:    negl %eax
 ; X64-AVX1-NEXT:    cltq
 ; X64-AVX1-NEXT:    vmovups -64(%rsp,%rax), %xmm0
@@ -2100,17 +19902,20 @@ define void @shl_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ; X64-AVX1-NEXT:    vmovups %xmm1, 16(%rdx)
 ; X64-AVX1-NEXT:    vmovups %xmm2, 32(%rdx)
 ; X64-AVX1-NEXT:    vmovups %xmm0, (%rdx)
+; X64-AVX1-NEXT:    popq %rax
 ; X64-AVX1-NEXT:    vzeroupper
 ; X64-AVX1-NEXT:    retq
 ;
-; X64-AVX512-LABEL: shl_64bytes:
+; X64-AVX512-LABEL: shl_64bytes_qwordOff:
 ; X64-AVX512:       # %bb.0:
+; X64-AVX512-NEXT:    pushq %rax
 ; X64-AVX512-NEXT:    vmovups (%rdi), %zmm0
 ; X64-AVX512-NEXT:    movl (%rsi), %eax
 ; X64-AVX512-NEXT:    vxorps %xmm1, %xmm1, %xmm1
 ; X64-AVX512-NEXT:    vmovups %zmm1, -{{[0-9]+}}(%rsp)
 ; X64-AVX512-NEXT:    vmovups %zmm0, -{{[0-9]+}}(%rsp)
-; X64-AVX512-NEXT:    andl $63, %eax
+; X64-AVX512-NEXT:    shll $3, %eax
+; X64-AVX512-NEXT:    andl $56, %eax
 ; X64-AVX512-NEXT:    negl %eax
 ; X64-AVX512-NEXT:    cltq
 ; X64-AVX512-NEXT:    vmovups -64(%rsp,%rax), %xmm0
@@ -2121,117 +19926,108 @@ define void @shl_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ; X64-AVX512-NEXT:    vmovups %xmm1, 16(%rdx)
 ; X64-AVX512-NEXT:    vmovups %xmm2, 32(%rdx)
 ; X64-AVX512-NEXT:    vmovups %xmm0, (%rdx)
+; X64-AVX512-NEXT:    popq %rax
 ; X64-AVX512-NEXT:    vzeroupper
 ; X64-AVX512-NEXT:    retq
 ;
-; X86-SSE2-LABEL: shl_64bytes:
+; X86-SSE2-LABEL: shl_64bytes_qwordOff:
 ; X86-SSE2:       # %bb.0:
 ; X86-SSE2-NEXT:    pushl %ebp
 ; X86-SSE2-NEXT:    pushl %ebx
 ; X86-SSE2-NEXT:    pushl %edi
 ; X86-SSE2-NEXT:    pushl %esi
-; X86-SSE2-NEXT:    subl $168, %esp
-; X86-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-SSE2-NEXT:    movl (%eax), %ecx
-; X86-SSE2-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-SSE2-NEXT:    movl 4(%eax), %ecx
-; X86-SSE2-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-SSE2-NEXT:    movl 8(%eax), %ecx
-; X86-SSE2-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-SSE2-NEXT:    movl 12(%eax), %ecx
-; X86-SSE2-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-SSE2-NEXT:    movl 16(%eax), %ecx
-; X86-SSE2-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-SSE2-NEXT:    movl 20(%eax), %ecx
-; X86-SSE2-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-SSE2-NEXT:    movl 24(%eax), %ecx
-; X86-SSE2-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-SSE2-NEXT:    movl 28(%eax), %ecx
-; X86-SSE2-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-SSE2-NEXT:    movl 32(%eax), %ecx
-; X86-SSE2-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-SSE2-NEXT:    movl 36(%eax), %ecx
-; X86-SSE2-NEXT:    movl %ecx, (%esp) # 4-byte Spill
-; X86-SSE2-NEXT:    movl 40(%eax), %ebp
-; X86-SSE2-NEXT:    movl 44(%eax), %ebx
-; X86-SSE2-NEXT:    movl 48(%eax), %edi
-; X86-SSE2-NEXT:    movl 52(%eax), %esi
-; X86-SSE2-NEXT:    movl 56(%eax), %edx
-; X86-SSE2-NEXT:    movl 60(%eax), %ecx
-; X86-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-SSE2-NEXT:    movl (%eax), %eax
-; X86-SSE2-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X86-SSE2-NEXT:    subl $188, %esp
+; X86-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-SSE2-NEXT:    movl (%ecx), %eax
+; X86-SSE2-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-SSE2-NEXT:    movl 4(%ecx), %eax
+; X86-SSE2-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-SSE2-NEXT:    movl 8(%ecx), %eax
+; X86-SSE2-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-SSE2-NEXT:    movl 12(%ecx), %eax
+; X86-SSE2-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-SSE2-NEXT:    movl 16(%ecx), %eax
+; X86-SSE2-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-SSE2-NEXT:    movl 20(%ecx), %eax
+; X86-SSE2-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-SSE2-NEXT:    movl 24(%ecx), %eax
+; X86-SSE2-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-SSE2-NEXT:    movl 28(%ecx), %eax
+; X86-SSE2-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-SSE2-NEXT:    movl 32(%ecx), %eax
+; X86-SSE2-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-SSE2-NEXT:    movl 36(%ecx), %eax
+; X86-SSE2-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-SSE2-NEXT:    movl 40(%ecx), %ebp
+; X86-SSE2-NEXT:    movl 44(%ecx), %ebx
+; X86-SSE2-NEXT:    movl 48(%ecx), %edi
+; X86-SSE2-NEXT:    movl 52(%ecx), %esi
+; X86-SSE2-NEXT:    movl 56(%ecx), %edx
+; X86-SSE2-NEXT:    movl 60(%ecx), %eax
+; X86-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-SSE2-NEXT:    movl (%ecx), %ecx
+; X86-SSE2-NEXT:    xorps %xmm0, %xmm0
+; X86-SSE2-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
+; X86-SSE2-NEXT:    movl %eax, {{[0-9]+}}(%esp)
 ; X86-SSE2-NEXT:    movl %edx, {{[0-9]+}}(%esp)
 ; X86-SSE2-NEXT:    movl %esi, {{[0-9]+}}(%esp)
 ; X86-SSE2-NEXT:    movl %edi, {{[0-9]+}}(%esp)
 ; X86-SSE2-NEXT:    movl %ebx, {{[0-9]+}}(%esp)
 ; X86-SSE2-NEXT:    movl %ebp, {{[0-9]+}}(%esp)
-; X86-SSE2-NEXT:    movl (%esp), %ecx # 4-byte Reload
-; X86-SSE2-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
-; X86-SSE2-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-SSE2-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
-; X86-SSE2-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-SSE2-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
-; X86-SSE2-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-SSE2-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
-; X86-SSE2-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-SSE2-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
-; X86-SSE2-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-SSE2-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
-; X86-SSE2-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-SSE2-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
-; X86-SSE2-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-SSE2-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
-; X86-SSE2-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-SSE2-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
-; X86-SSE2-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-SSE2-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
-; X86-SSE2-NEXT:    andl $63, %eax
-; X86-SSE2-NEXT:    leal {{[0-9]+}}(%esp), %ecx
-; X86-SSE2-NEXT:    subl %eax, %ecx
-; X86-SSE2-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-SSE2-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-SSE2-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-SSE2-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-SSE2-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-SSE2-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-SSE2-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-SSE2-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-SSE2-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-SSE2-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-SSE2-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-SSE2-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-SSE2-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-SSE2-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-SSE2-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-SSE2-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-SSE2-NEXT:    movl (%ecx), %edx
+; X86-SSE2-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-SSE2-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-SSE2-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-SSE2-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-SSE2-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-SSE2-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-SSE2-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-SSE2-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-SSE2-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
+; X86-SSE2-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
+; X86-SSE2-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
+; X86-SSE2-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-SSE2-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-SSE2-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-SSE2-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-SSE2-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-SSE2-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-SSE2-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-SSE2-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-SSE2-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-SSE2-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-SSE2-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-SSE2-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-SSE2-NEXT:    shll $3, %ecx
+; X86-SSE2-NEXT:    andl $56, %ecx
+; X86-SSE2-NEXT:    leal {{[0-9]+}}(%esp), %eax
+; X86-SSE2-NEXT:    subl %ecx, %eax
+; X86-SSE2-NEXT:    movl (%eax), %edx
 ; X86-SSE2-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-SSE2-NEXT:    movl 4(%ecx), %edx
+; X86-SSE2-NEXT:    movl 4(%eax), %edx
 ; X86-SSE2-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-SSE2-NEXT:    movl 12(%ecx), %edx
+; X86-SSE2-NEXT:    movl 12(%eax), %edx
 ; X86-SSE2-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-SSE2-NEXT:    movl 8(%ecx), %edx
+; X86-SSE2-NEXT:    movl 8(%eax), %edx
 ; X86-SSE2-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-SSE2-NEXT:    movl 20(%ecx), %edx
+; X86-SSE2-NEXT:    movl 20(%eax), %edx
 ; X86-SSE2-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-SSE2-NEXT:    movl 16(%ecx), %edx
+; X86-SSE2-NEXT:    movl 16(%eax), %edx
 ; X86-SSE2-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-SSE2-NEXT:    movl 28(%ecx), %edx
+; X86-SSE2-NEXT:    movl 28(%eax), %edx
 ; X86-SSE2-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-SSE2-NEXT:    movl 24(%ecx), %edx
+; X86-SSE2-NEXT:    movl 24(%eax), %edx
 ; X86-SSE2-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-SSE2-NEXT:    movl 36(%ecx), %edx
+; X86-SSE2-NEXT:    movl 36(%eax), %edx
 ; X86-SSE2-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-SSE2-NEXT:    movl 32(%ecx), %edx
-; X86-SSE2-NEXT:    movl %edx, (%esp) # 4-byte Spill
-; X86-SSE2-NEXT:    movl 44(%ecx), %ebp
-; X86-SSE2-NEXT:    movl 40(%ecx), %ebx
-; X86-SSE2-NEXT:    movl 52(%ecx), %edi
-; X86-SSE2-NEXT:    movl 60(%ecx), %esi
-; X86-SSE2-NEXT:    movl 56(%ecx), %edx
-; X86-SSE2-NEXT:    negl %eax
-; X86-SSE2-NEXT:    movl 152(%esp,%eax), %ecx
+; X86-SSE2-NEXT:    movl 32(%eax), %edx
+; X86-SSE2-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-SSE2-NEXT:    movl 44(%eax), %ebp
+; X86-SSE2-NEXT:    movl 40(%eax), %ebx
+; X86-SSE2-NEXT:    movl 52(%eax), %edi
+; X86-SSE2-NEXT:    movl 60(%eax), %esi
+; X86-SSE2-NEXT:    movl 56(%eax), %edx
+; X86-SSE2-NEXT:    negl %ecx
+; X86-SSE2-NEXT:    movl 160(%esp,%ecx), %ecx
 ; X86-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-SSE2-NEXT:    movl %edx, 56(%eax)
 ; X86-SSE2-NEXT:    movl %esi, 60(%eax)
@@ -2239,7 +20035,7 @@ define void @shl_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ; X86-SSE2-NEXT:    movl %edi, 52(%eax)
 ; X86-SSE2-NEXT:    movl %ebx, 40(%eax)
 ; X86-SSE2-NEXT:    movl %ebp, 44(%eax)
-; X86-SSE2-NEXT:    movl (%esp), %ecx # 4-byte Reload
+; X86-SSE2-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
 ; X86-SSE2-NEXT:    movl %ecx, 32(%eax)
 ; X86-SSE2-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
 ; X86-SSE2-NEXT:    movl %ecx, 36(%eax)
@@ -2259,16 +20055,16 @@ define void @shl_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ; X86-SSE2-NEXT:    movl %ecx, (%eax)
 ; X86-SSE2-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
 ; X86-SSE2-NEXT:    movl %ecx, 4(%eax)
-; X86-SSE2-NEXT:    addl $168, %esp
+; X86-SSE2-NEXT:    addl $188, %esp
 ; X86-SSE2-NEXT:    popl %esi
 ; X86-SSE2-NEXT:    popl %edi
 ; X86-SSE2-NEXT:    popl %ebx
 ; X86-SSE2-NEXT:    popl %ebp
 ; X86-SSE2-NEXT:    retl
 ;
-; X86-SSE42-LABEL: shl_64bytes:
+; X86-SSE42-LABEL: shl_64bytes_qwordOff:
 ; X86-SSE42:       # %bb.0:
-; X86-SSE42-NEXT:    subl $128, %esp
+; X86-SSE42-NEXT:    subl $140, %esp
 ; X86-SSE42-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-SSE42-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-SSE42-NEXT:    movl {{[0-9]+}}(%esp), %edx
@@ -2278,15 +20074,16 @@ define void @shl_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ; X86-SSE42-NEXT:    movups 48(%edx), %xmm3
 ; X86-SSE42-NEXT:    movl (%ecx), %ecx
 ; X86-SSE42-NEXT:    xorps %xmm4, %xmm4
-; X86-SSE42-NEXT:    movups %xmm4, {{[0-9]+}}(%esp)
-; X86-SSE42-NEXT:    movups %xmm4, {{[0-9]+}}(%esp)
-; X86-SSE42-NEXT:    movups %xmm4, {{[0-9]+}}(%esp)
-; X86-SSE42-NEXT:    movups %xmm4, (%esp)
-; X86-SSE42-NEXT:    movups %xmm3, {{[0-9]+}}(%esp)
-; X86-SSE42-NEXT:    movups %xmm2, {{[0-9]+}}(%esp)
-; X86-SSE42-NEXT:    movups %xmm1, {{[0-9]+}}(%esp)
-; X86-SSE42-NEXT:    movups %xmm0, {{[0-9]+}}(%esp)
-; X86-SSE42-NEXT:    andl $63, %ecx
+; X86-SSE42-NEXT:    movaps %xmm4, {{[0-9]+}}(%esp)
+; X86-SSE42-NEXT:    movaps %xmm4, {{[0-9]+}}(%esp)
+; X86-SSE42-NEXT:    movaps %xmm4, {{[0-9]+}}(%esp)
+; X86-SSE42-NEXT:    movaps %xmm4, (%esp)
+; X86-SSE42-NEXT:    movaps %xmm3, {{[0-9]+}}(%esp)
+; X86-SSE42-NEXT:    movaps %xmm2, {{[0-9]+}}(%esp)
+; X86-SSE42-NEXT:    movaps %xmm1, {{[0-9]+}}(%esp)
+; X86-SSE42-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
+; X86-SSE42-NEXT:    shll $3, %ecx
+; X86-SSE42-NEXT:    andl $56, %ecx
 ; X86-SSE42-NEXT:    leal {{[0-9]+}}(%esp), %edx
 ; X86-SSE42-NEXT:    subl %ecx, %edx
 ; X86-SSE42-NEXT:    movups (%edx), %xmm0
@@ -2298,12 +20095,12 @@ define void @shl_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ; X86-SSE42-NEXT:    movups %xmm2, 32(%eax)
 ; X86-SSE42-NEXT:    movups %xmm1, 16(%eax)
 ; X86-SSE42-NEXT:    movups %xmm0, (%eax)
-; X86-SSE42-NEXT:    addl $128, %esp
+; X86-SSE42-NEXT:    addl $140, %esp
 ; X86-SSE42-NEXT:    retl
 ;
-; X86-AVX1-LABEL: shl_64bytes:
+; X86-AVX1-LABEL: shl_64bytes_qwordOff:
 ; X86-AVX1:       # %bb.0:
-; X86-AVX1-NEXT:    subl $128, %esp
+; X86-AVX1-NEXT:    subl $140, %esp
 ; X86-AVX1-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-AVX1-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-AVX1-NEXT:    movl {{[0-9]+}}(%esp), %edx
@@ -2315,7 +20112,8 @@ define void @shl_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ; X86-AVX1-NEXT:    vmovups %ymm2, (%esp)
 ; X86-AVX1-NEXT:    vmovups %ymm1, {{[0-9]+}}(%esp)
 ; X86-AVX1-NEXT:    vmovups %ymm0, {{[0-9]+}}(%esp)
-; X86-AVX1-NEXT:    andl $63, %ecx
+; X86-AVX1-NEXT:    shll $3, %ecx
+; X86-AVX1-NEXT:    andl $56, %ecx
 ; X86-AVX1-NEXT:    leal {{[0-9]+}}(%esp), %edx
 ; X86-AVX1-NEXT:    subl %ecx, %edx
 ; X86-AVX1-NEXT:    vmovups (%edx), %xmm0
@@ -2327,13 +20125,13 @@ define void @shl_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ; X86-AVX1-NEXT:    vmovups %xmm2, 32(%eax)
 ; X86-AVX1-NEXT:    vmovups %xmm1, 16(%eax)
 ; X86-AVX1-NEXT:    vmovups %xmm0, (%eax)
-; X86-AVX1-NEXT:    addl $128, %esp
+; X86-AVX1-NEXT:    addl $140, %esp
 ; X86-AVX1-NEXT:    vzeroupper
 ; X86-AVX1-NEXT:    retl
 ;
-; X86-AVX512-LABEL: shl_64bytes:
+; X86-AVX512-LABEL: shl_64bytes_qwordOff:
 ; X86-AVX512:       # %bb.0:
-; X86-AVX512-NEXT:    subl $128, %esp
+; X86-AVX512-NEXT:    subl $140, %esp
 ; X86-AVX512-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-AVX512-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-AVX512-NEXT:    movl {{[0-9]+}}(%esp), %edx
@@ -2342,7 +20140,8 @@ define void @shl_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ; X86-AVX512-NEXT:    vxorps %xmm1, %xmm1, %xmm1
 ; X86-AVX512-NEXT:    vmovups %zmm1, (%esp)
 ; X86-AVX512-NEXT:    vmovups %zmm0, {{[0-9]+}}(%esp)
-; X86-AVX512-NEXT:    andl $63, %ecx
+; X86-AVX512-NEXT:    shll $3, %ecx
+; X86-AVX512-NEXT:    andl $56, %ecx
 ; X86-AVX512-NEXT:    leal {{[0-9]+}}(%esp), %edx
 ; X86-AVX512-NEXT:    subl %ecx, %edx
 ; X86-AVX512-NEXT:    vmovups (%edx), %xmm0
@@ -2354,18 +20153,4121 @@ define void @shl_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ; X86-AVX512-NEXT:    vmovups %xmm2, 32(%eax)
 ; X86-AVX512-NEXT:    vmovups %xmm1, 16(%eax)
 ; X86-AVX512-NEXT:    vmovups %xmm0, (%eax)
-; X86-AVX512-NEXT:    addl $128, %esp
+; X86-AVX512-NEXT:    addl $140, %esp
 ; X86-AVX512-NEXT:    vzeroupper
 ; X86-AVX512-NEXT:    retl
   %src = load i512, ptr %src.ptr, align 1
-  %byteOff = load i512, ptr %byteOff.ptr, align 1
-  %bitOff = shl i512 %byteOff, 3
+  %qwordOff = load i512, ptr %qwordOff.ptr, align 1
+  %bitOff = shl i512 %qwordOff, 6
   %res = shl i512 %src, %bitOff
   store i512 %res, ptr %dst, align 1
   ret void
 }
+
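
The ashr_64bytes checks that follow rely on the same slot trick with a sign fill instead of zero padding: the topmost word is arithmetically shifted down by 63 (`sarq $63, %r14`) and stored eight times above the value, so a load past the end of the source picks up pure sign bits. A byte-granular C reference for what those checks compute, again with illustrative names only; the generated code instead qword-aligns the load (`andl $56`) and fixes up the remaining bits with `shrq`/`shlq` pairs:

    #include <stdint.h>
    #include <string.h>

    static void ashr_64bytes_ref(const uint8_t *src, unsigned byteOff,
                                 uint8_t *dst) {
      uint8_t slot[128];
      memcpy(slot, src, 64);
      /* Sign-fill the upper half, mirroring the eight stores of %r14. */
      memset(slot + 64, (src[63] & 0x80) ? 0xFF : 0x00, 64);
      unsigned k = byteOff & 63;  /* keep only the in-range byte offset */
      memcpy(dst, slot + k, 64);  /* shift right by 8*k bits = load at +k */
    }
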
 define void @ashr_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
-; X64-SSE2-LABEL: ashr_64bytes:
+; FALLBACK0-LABEL: ashr_64bytes:
+; FALLBACK0:       # %bb.0:
+; FALLBACK0-NEXT:    pushq %r15
+; FALLBACK0-NEXT:    pushq %r14
+; FALLBACK0-NEXT:    pushq %r13
+; FALLBACK0-NEXT:    pushq %r12
+; FALLBACK0-NEXT:    pushq %rbx
+; FALLBACK0-NEXT:    movq (%rdi), %rax
+; FALLBACK0-NEXT:    movq 8(%rdi), %rcx
+; FALLBACK0-NEXT:    movq 16(%rdi), %r8
+; FALLBACK0-NEXT:    movq 24(%rdi), %r9
+; FALLBACK0-NEXT:    movq 32(%rdi), %r10
+; FALLBACK0-NEXT:    movq 40(%rdi), %r11
+; FALLBACK0-NEXT:    movq 48(%rdi), %rbx
+; FALLBACK0-NEXT:    movq 56(%rdi), %r14
+; FALLBACK0-NEXT:    movl (%rsi), %edi
+; FALLBACK0-NEXT:    movq %r14, -{{[0-9]+}}(%rsp)
+; FALLBACK0-NEXT:    movq %rbx, -{{[0-9]+}}(%rsp)
+; FALLBACK0-NEXT:    movq %r11, -{{[0-9]+}}(%rsp)
+; FALLBACK0-NEXT:    movq %r10, -{{[0-9]+}}(%rsp)
+; FALLBACK0-NEXT:    movq %r9, -{{[0-9]+}}(%rsp)
+; FALLBACK0-NEXT:    movq %r8, -{{[0-9]+}}(%rsp)
+; FALLBACK0-NEXT:    movq %rcx, -{{[0-9]+}}(%rsp)
+; FALLBACK0-NEXT:    movq %rax, -{{[0-9]+}}(%rsp)
+; FALLBACK0-NEXT:    sarq $63, %r14
+; FALLBACK0-NEXT:    movq %r14, -{{[0-9]+}}(%rsp)
+; FALLBACK0-NEXT:    movq %r14, -{{[0-9]+}}(%rsp)
+; FALLBACK0-NEXT:    movq %r14, -{{[0-9]+}}(%rsp)
+; FALLBACK0-NEXT:    movq %r14, -{{[0-9]+}}(%rsp)
+; FALLBACK0-NEXT:    movq %r14, -{{[0-9]+}}(%rsp)
+; FALLBACK0-NEXT:    movq %r14, -{{[0-9]+}}(%rsp)
+; FALLBACK0-NEXT:    movq %r14, -{{[0-9]+}}(%rsp)
+; FALLBACK0-NEXT:    movq %r14, -{{[0-9]+}}(%rsp)
+; FALLBACK0-NEXT:    leal (,%rdi,8), %eax
+; FALLBACK0-NEXT:    andl $56, %eax
+; FALLBACK0-NEXT:    andl $56, %edi
+; FALLBACK0-NEXT:    movq -128(%rsp,%rdi), %r10
+; FALLBACK0-NEXT:    movq -120(%rsp,%rdi), %r8
+; FALLBACK0-NEXT:    movq %r8, %r11
+; FALLBACK0-NEXT:    movl %eax, %ecx
+; FALLBACK0-NEXT:    shrq %cl, %r11
+; FALLBACK0-NEXT:    movl %eax, %esi
+; FALLBACK0-NEXT:    notb %sil
+; FALLBACK0-NEXT:    movq -112(%rsp,%rdi), %rbx
+; FALLBACK0-NEXT:    leaq (%rbx,%rbx), %r9
+; FALLBACK0-NEXT:    movl %esi, %ecx
+; FALLBACK0-NEXT:    shlq %cl, %r9
+; FALLBACK0-NEXT:    orq %r11, %r9
+; FALLBACK0-NEXT:    movl %eax, %ecx
+; FALLBACK0-NEXT:    shrq %cl, %r10
+; FALLBACK0-NEXT:    addq %r8, %r8
+; FALLBACK0-NEXT:    movl %esi, %ecx
+; FALLBACK0-NEXT:    shlq %cl, %r8
+; FALLBACK0-NEXT:    orq %r10, %r8
+; FALLBACK0-NEXT:    movq -104(%rsp,%rdi), %r10
+; FALLBACK0-NEXT:    movq %r10, %r15
+; FALLBACK0-NEXT:    movl %eax, %ecx
+; FALLBACK0-NEXT:    shrq %cl, %r15
+; FALLBACK0-NEXT:    movq -96(%rsp,%rdi), %r14
+; FALLBACK0-NEXT:    leaq (%r14,%r14), %r11
+; FALLBACK0-NEXT:    movl %esi, %ecx
+; FALLBACK0-NEXT:    shlq %cl, %r11
+; FALLBACK0-NEXT:    orq %r15, %r11
+; FALLBACK0-NEXT:    movl %eax, %ecx
+; FALLBACK0-NEXT:    shrq %cl, %rbx
+; FALLBACK0-NEXT:    addq %r10, %r10
+; FALLBACK0-NEXT:    movl %esi, %ecx
+; FALLBACK0-NEXT:    shlq %cl, %r10
+; FALLBACK0-NEXT:    orq %rbx, %r10
+; FALLBACK0-NEXT:    movq -88(%rsp,%rdi), %rbx
+; FALLBACK0-NEXT:    movq %rbx, %r12
+; FALLBACK0-NEXT:    movl %eax, %ecx
+; FALLBACK0-NEXT:    shrq %cl, %r12
+; FALLBACK0-NEXT:    movq -80(%rsp,%rdi), %r13
+; FALLBACK0-NEXT:    leaq (%r13,%r13), %r15
+; FALLBACK0-NEXT:    movl %esi, %ecx
+; FALLBACK0-NEXT:    shlq %cl, %r15
+; FALLBACK0-NEXT:    orq %r12, %r15
+; FALLBACK0-NEXT:    movl %eax, %ecx
+; FALLBACK0-NEXT:    shrq %cl, %r14
+; FALLBACK0-NEXT:    addq %rbx, %rbx
+; FALLBACK0-NEXT:    movl %esi, %ecx
+; FALLBACK0-NEXT:    shlq %cl, %rbx
+; FALLBACK0-NEXT:    orq %r14, %rbx
+; FALLBACK0-NEXT:    movl %eax, %ecx
+; FALLBACK0-NEXT:    shrq %cl, %r13
+; FALLBACK0-NEXT:    movq -72(%rsp,%rdi), %rdi
+; FALLBACK0-NEXT:    leaq (%rdi,%rdi), %r14
+; FALLBACK0-NEXT:    movl %esi, %ecx
+; FALLBACK0-NEXT:    shlq %cl, %r14
+; FALLBACK0-NEXT:    orq %r13, %r14
+; FALLBACK0-NEXT:    movl %eax, %ecx
+; FALLBACK0-NEXT:    sarq %cl, %rdi
+; FALLBACK0-NEXT:    movq %rdi, 56(%rdx)
+; FALLBACK0-NEXT:    movq %r14, 48(%rdx)
+; FALLBACK0-NEXT:    movq %rbx, 32(%rdx)
+; FALLBACK0-NEXT:    movq %r15, 40(%rdx)
+; FALLBACK0-NEXT:    movq %r10, 16(%rdx)
+; FALLBACK0-NEXT:    movq %r11, 24(%rdx)
+; FALLBACK0-NEXT:    movq %r8, (%rdx)
+; FALLBACK0-NEXT:    movq %r9, 8(%rdx)
+; FALLBACK0-NEXT:    popq %rbx
+; FALLBACK0-NEXT:    popq %r12
+; FALLBACK0-NEXT:    popq %r13
+; FALLBACK0-NEXT:    popq %r14
+; FALLBACK0-NEXT:    popq %r15
+; FALLBACK0-NEXT:    retq
+;
+; FALLBACK1-LABEL: ashr_64bytes:
+; FALLBACK1:       # %bb.0:
+; FALLBACK1-NEXT:    pushq %r15
+; FALLBACK1-NEXT:    pushq %r14
+; FALLBACK1-NEXT:    pushq %rbx
+; FALLBACK1-NEXT:    movq (%rdi), %rcx
+; FALLBACK1-NEXT:    movq 8(%rdi), %r8
+; FALLBACK1-NEXT:    movq 16(%rdi), %r9
+; FALLBACK1-NEXT:    movq 24(%rdi), %r10
+; FALLBACK1-NEXT:    movq 32(%rdi), %r11
+; FALLBACK1-NEXT:    movq 40(%rdi), %rbx
+; FALLBACK1-NEXT:    movq 48(%rdi), %r14
+; FALLBACK1-NEXT:    movq 56(%rdi), %rdi
+; FALLBACK1-NEXT:    movl (%rsi), %eax
+; FALLBACK1-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK1-NEXT:    movq %r14, -{{[0-9]+}}(%rsp)
+; FALLBACK1-NEXT:    movq %rbx, -{{[0-9]+}}(%rsp)
+; FALLBACK1-NEXT:    movq %r11, -{{[0-9]+}}(%rsp)
+; FALLBACK1-NEXT:    movq %r10, -{{[0-9]+}}(%rsp)
+; FALLBACK1-NEXT:    movq %r9, -{{[0-9]+}}(%rsp)
+; FALLBACK1-NEXT:    movq %r8, -{{[0-9]+}}(%rsp)
+; FALLBACK1-NEXT:    movq %rcx, -{{[0-9]+}}(%rsp)
+; FALLBACK1-NEXT:    sarq $63, %rdi
+; FALLBACK1-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK1-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK1-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK1-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK1-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK1-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK1-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK1-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK1-NEXT:    leal (,%rax,8), %ecx
+; FALLBACK1-NEXT:    andl $56, %ecx
+; FALLBACK1-NEXT:    andl $56, %eax
+; FALLBACK1-NEXT:    movq -112(%rsp,%rax), %rdi
+; FALLBACK1-NEXT:    movq -128(%rsp,%rax), %rsi
+; FALLBACK1-NEXT:    movq -120(%rsp,%rax), %r9
+; FALLBACK1-NEXT:    movq %r9, %r8
+; FALLBACK1-NEXT:    shrdq %cl, %rdi, %r8
+; FALLBACK1-NEXT:    movq -96(%rsp,%rax), %r10
+; FALLBACK1-NEXT:    movq -104(%rsp,%rax), %r11
+; FALLBACK1-NEXT:    movq %r11, %rbx
+; FALLBACK1-NEXT:    shrdq %cl, %r10, %rbx
+; FALLBACK1-NEXT:    shrdq %cl, %r11, %rdi
+; FALLBACK1-NEXT:    movq -80(%rsp,%rax), %r11
+; FALLBACK1-NEXT:    movq -88(%rsp,%rax), %r14
+; FALLBACK1-NEXT:    movq %r14, %r15
+; FALLBACK1-NEXT:    shrdq %cl, %r11, %r15
+; FALLBACK1-NEXT:    shrdq %cl, %r14, %r10
+; FALLBACK1-NEXT:    movq -72(%rsp,%rax), %rax
+; FALLBACK1-NEXT:    shrdq %cl, %rax, %r11
+; FALLBACK1-NEXT:    shrdq %cl, %r9, %rsi
+; FALLBACK1-NEXT:    # kill: def $cl killed $cl killed $ecx
+; FALLBACK1-NEXT:    sarq %cl, %rax
+; FALLBACK1-NEXT:    movq %r11, 48(%rdx)
+; FALLBACK1-NEXT:    movq %rax, 56(%rdx)
+; FALLBACK1-NEXT:    movq %r10, 32(%rdx)
+; FALLBACK1-NEXT:    movq %r15, 40(%rdx)
+; FALLBACK1-NEXT:    movq %rdi, 16(%rdx)
+; FALLBACK1-NEXT:    movq %rbx, 24(%rdx)
+; FALLBACK1-NEXT:    movq %rsi, (%rdx)
+; FALLBACK1-NEXT:    movq %r8, 8(%rdx)
+; FALLBACK1-NEXT:    popq %rbx
+; FALLBACK1-NEXT:    popq %r14
+; FALLBACK1-NEXT:    popq %r15
+; FALLBACK1-NEXT:    retq
+;
+; FALLBACK2-LABEL: ashr_64bytes:
+; FALLBACK2:       # %bb.0:
+; FALLBACK2-NEXT:    pushq %rbp
+; FALLBACK2-NEXT:    pushq %r15
+; FALLBACK2-NEXT:    pushq %r14
+; FALLBACK2-NEXT:    pushq %r13
+; FALLBACK2-NEXT:    pushq %r12
+; FALLBACK2-NEXT:    pushq %rbx
+; FALLBACK2-NEXT:    pushq %rax
+; FALLBACK2-NEXT:    movq (%rdi), %rcx
+; FALLBACK2-NEXT:    movq 8(%rdi), %r8
+; FALLBACK2-NEXT:    movq 16(%rdi), %r9
+; FALLBACK2-NEXT:    movq 24(%rdi), %r10
+; FALLBACK2-NEXT:    movq 32(%rdi), %r11
+; FALLBACK2-NEXT:    movq 40(%rdi), %rbx
+; FALLBACK2-NEXT:    movq 48(%rdi), %r14
+; FALLBACK2-NEXT:    movq 56(%rdi), %rdi
+; FALLBACK2-NEXT:    movl (%rsi), %eax
+; FALLBACK2-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK2-NEXT:    movq %r14, -{{[0-9]+}}(%rsp)
+; FALLBACK2-NEXT:    movq %rbx, -{{[0-9]+}}(%rsp)
+; FALLBACK2-NEXT:    movq %r11, -{{[0-9]+}}(%rsp)
+; FALLBACK2-NEXT:    movq %r10, -{{[0-9]+}}(%rsp)
+; FALLBACK2-NEXT:    movq %r9, -{{[0-9]+}}(%rsp)
+; FALLBACK2-NEXT:    movq %r8, -{{[0-9]+}}(%rsp)
+; FALLBACK2-NEXT:    movq %rcx, -{{[0-9]+}}(%rsp)
+; FALLBACK2-NEXT:    sarq $63, %rdi
+; FALLBACK2-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK2-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK2-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK2-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK2-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK2-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK2-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK2-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK2-NEXT:    leal (,%rax,8), %ecx
+; FALLBACK2-NEXT:    andl $56, %ecx
+; FALLBACK2-NEXT:    andl $56, %eax
+; FALLBACK2-NEXT:    movq -120(%rsp,%rax), %rdi
+; FALLBACK2-NEXT:    movq -112(%rsp,%rax), %r9
+; FALLBACK2-NEXT:    shrxq %rcx, %rdi, %rbx
+; FALLBACK2-NEXT:    shrxq %rcx, -128(%rsp,%rax), %r13
+; FALLBACK2-NEXT:    movq -104(%rsp,%rax), %rsi
+; FALLBACK2-NEXT:    shrxq %rcx, %rsi, %r8
+; FALLBACK2-NEXT:    movq -96(%rsp,%rax), %r10
+; FALLBACK2-NEXT:    shrxq %rcx, %r9, %r11
+; FALLBACK2-NEXT:    movq -88(%rsp,%rax), %r14
+; FALLBACK2-NEXT:    shrxq %rcx, %r14, %r15
+; FALLBACK2-NEXT:    shrxq %rcx, %r10, %rbp
+; FALLBACK2-NEXT:    movl %ecx, %r12d
+; FALLBACK2-NEXT:    notb %r12b
+; FALLBACK2-NEXT:    addq %r9, %r9
+; FALLBACK2-NEXT:    shlxq %r12, %r9, %r9
+; FALLBACK2-NEXT:    orq %rbx, %r9
+; FALLBACK2-NEXT:    addq %rdi, %rdi
+; FALLBACK2-NEXT:    shlxq %r12, %rdi, %rdi
+; FALLBACK2-NEXT:    orq %r13, %rdi
+; FALLBACK2-NEXT:    movq -80(%rsp,%rax), %rbx
+; FALLBACK2-NEXT:    shrxq %rcx, %rbx, %r13
+; FALLBACK2-NEXT:    movq -72(%rsp,%rax), %rax
+; FALLBACK2-NEXT:    sarxq %rcx, %rax, %rcx
+; FALLBACK2-NEXT:    addq %r10, %r10
+; FALLBACK2-NEXT:    shlxq %r12, %r10, %r10
+; FALLBACK2-NEXT:    orq %r8, %r10
+; FALLBACK2-NEXT:    addq %rsi, %rsi
+; FALLBACK2-NEXT:    shlxq %r12, %rsi, %rsi
+; FALLBACK2-NEXT:    orq %r11, %rsi
+; FALLBACK2-NEXT:    leaq (%rbx,%rbx), %r8
+; FALLBACK2-NEXT:    shlxq %r12, %r8, %r8
+; FALLBACK2-NEXT:    orq %r15, %r8
+; FALLBACK2-NEXT:    addq %r14, %r14
+; FALLBACK2-NEXT:    shlxq %r12, %r14, %r11
+; FALLBACK2-NEXT:    orq %rbp, %r11
+; FALLBACK2-NEXT:    addq %rax, %rax
+; FALLBACK2-NEXT:    shlxq %r12, %rax, %rax
+; FALLBACK2-NEXT:    orq %r13, %rax
+; FALLBACK2-NEXT:    movq %rcx, 56(%rdx)
+; FALLBACK2-NEXT:    movq %rax, 48(%rdx)
+; FALLBACK2-NEXT:    movq %r11, 32(%rdx)
+; FALLBACK2-NEXT:    movq %r8, 40(%rdx)
+; FALLBACK2-NEXT:    movq %rsi, 16(%rdx)
+; FALLBACK2-NEXT:    movq %r10, 24(%rdx)
+; FALLBACK2-NEXT:    movq %rdi, (%rdx)
+; FALLBACK2-NEXT:    movq %r9, 8(%rdx)
+; FALLBACK2-NEXT:    addq $8, %rsp
+; FALLBACK2-NEXT:    popq %rbx
+; FALLBACK2-NEXT:    popq %r12
+; FALLBACK2-NEXT:    popq %r13
+; FALLBACK2-NEXT:    popq %r14
+; FALLBACK2-NEXT:    popq %r15
+; FALLBACK2-NEXT:    popq %rbp
+; FALLBACK2-NEXT:    retq
+;
+; FALLBACK3-LABEL: ashr_64bytes:
+; FALLBACK3:       # %bb.0:
+; FALLBACK3-NEXT:    pushq %r15
+; FALLBACK3-NEXT:    pushq %r14
+; FALLBACK3-NEXT:    pushq %rbx
+; FALLBACK3-NEXT:    movq (%rdi), %rcx
+; FALLBACK3-NEXT:    movq 8(%rdi), %r8
+; FALLBACK3-NEXT:    movq 16(%rdi), %r9
+; FALLBACK3-NEXT:    movq 24(%rdi), %r10
+; FALLBACK3-NEXT:    movq 32(%rdi), %r11
+; FALLBACK3-NEXT:    movq 40(%rdi), %rbx
+; FALLBACK3-NEXT:    movq 48(%rdi), %r14
+; FALLBACK3-NEXT:    movq 56(%rdi), %rdi
+; FALLBACK3-NEXT:    movl (%rsi), %eax
+; FALLBACK3-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK3-NEXT:    movq %r14, -{{[0-9]+}}(%rsp)
+; FALLBACK3-NEXT:    movq %rbx, -{{[0-9]+}}(%rsp)
+; FALLBACK3-NEXT:    movq %r11, -{{[0-9]+}}(%rsp)
+; FALLBACK3-NEXT:    movq %r10, -{{[0-9]+}}(%rsp)
+; FALLBACK3-NEXT:    movq %r9, -{{[0-9]+}}(%rsp)
+; FALLBACK3-NEXT:    movq %r8, -{{[0-9]+}}(%rsp)
+; FALLBACK3-NEXT:    movq %rcx, -{{[0-9]+}}(%rsp)
+; FALLBACK3-NEXT:    sarq $63, %rdi
+; FALLBACK3-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK3-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK3-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK3-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK3-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK3-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK3-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK3-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK3-NEXT:    leal (,%rax,8), %ecx
+; FALLBACK3-NEXT:    andl $56, %ecx
+; FALLBACK3-NEXT:    andl $56, %eax
+; FALLBACK3-NEXT:    movq -112(%rsp,%rax), %rdi
+; FALLBACK3-NEXT:    movq -128(%rsp,%rax), %rsi
+; FALLBACK3-NEXT:    movq -120(%rsp,%rax), %r9
+; FALLBACK3-NEXT:    movq %r9, %r8
+; FALLBACK3-NEXT:    shrdq %cl, %rdi, %r8
+; FALLBACK3-NEXT:    movq -96(%rsp,%rax), %r10
+; FALLBACK3-NEXT:    movq -104(%rsp,%rax), %r11
+; FALLBACK3-NEXT:    movq %r11, %rbx
+; FALLBACK3-NEXT:    shrdq %cl, %r10, %rbx
+; FALLBACK3-NEXT:    shrdq %cl, %r11, %rdi
+; FALLBACK3-NEXT:    movq -80(%rsp,%rax), %r11
+; FALLBACK3-NEXT:    movq -88(%rsp,%rax), %r14
+; FALLBACK3-NEXT:    movq %r14, %r15
+; FALLBACK3-NEXT:    shrdq %cl, %r11, %r15
+; FALLBACK3-NEXT:    shrdq %cl, %r14, %r10
+; FALLBACK3-NEXT:    movq -72(%rsp,%rax), %rax
+; FALLBACK3-NEXT:    shrdq %cl, %rax, %r11
+; FALLBACK3-NEXT:    sarxq %rcx, %rax, %rax
+; FALLBACK3-NEXT:    # kill: def $cl killed $cl killed $rcx
+; FALLBACK3-NEXT:    shrdq %cl, %r9, %rsi
+; FALLBACK3-NEXT:    movq %r11, 48(%rdx)
+; FALLBACK3-NEXT:    movq %r10, 32(%rdx)
+; FALLBACK3-NEXT:    movq %r15, 40(%rdx)
+; FALLBACK3-NEXT:    movq %rdi, 16(%rdx)
+; FALLBACK3-NEXT:    movq %rbx, 24(%rdx)
+; FALLBACK3-NEXT:    movq %rsi, (%rdx)
+; FALLBACK3-NEXT:    movq %r8, 8(%rdx)
+; FALLBACK3-NEXT:    movq %rax, 56(%rdx)
+; FALLBACK3-NEXT:    popq %rbx
+; FALLBACK3-NEXT:    popq %r14
+; FALLBACK3-NEXT:    popq %r15
+; FALLBACK3-NEXT:    retq
+;
+; FALLBACK4-LABEL: ashr_64bytes:
+; FALLBACK4:       # %bb.0:
+; FALLBACK4-NEXT:    pushq %rbp
+; FALLBACK4-NEXT:    pushq %r15
+; FALLBACK4-NEXT:    pushq %r14
+; FALLBACK4-NEXT:    pushq %r13
+; FALLBACK4-NEXT:    pushq %r12
+; FALLBACK4-NEXT:    pushq %rbx
+; FALLBACK4-NEXT:    pushq %rax
+; FALLBACK4-NEXT:    movups (%rdi), %xmm0
+; FALLBACK4-NEXT:    movups 16(%rdi), %xmm1
+; FALLBACK4-NEXT:    movups 32(%rdi), %xmm2
+; FALLBACK4-NEXT:    movq 48(%rdi), %rax
+; FALLBACK4-NEXT:    movq 56(%rdi), %rcx
+; FALLBACK4-NEXT:    movl (%rsi), %edi
+; FALLBACK4-NEXT:    movq %rcx, -{{[0-9]+}}(%rsp)
+; FALLBACK4-NEXT:    movq %rax, -{{[0-9]+}}(%rsp)
+; FALLBACK4-NEXT:    movaps %xmm2, -{{[0-9]+}}(%rsp)
+; FALLBACK4-NEXT:    movaps %xmm1, -{{[0-9]+}}(%rsp)
+; FALLBACK4-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
+; FALLBACK4-NEXT:    sarq $63, %rcx
+; FALLBACK4-NEXT:    movq %rcx, -{{[0-9]+}}(%rsp)
+; FALLBACK4-NEXT:    movq %rcx, -{{[0-9]+}}(%rsp)
+; FALLBACK4-NEXT:    movq %rcx, -{{[0-9]+}}(%rsp)
+; FALLBACK4-NEXT:    movq %rcx, -{{[0-9]+}}(%rsp)
+; FALLBACK4-NEXT:    movq %rcx, -{{[0-9]+}}(%rsp)
+; FALLBACK4-NEXT:    movq %rcx, -{{[0-9]+}}(%rsp)
+; FALLBACK4-NEXT:    movq %rcx, -{{[0-9]+}}(%rsp)
+; FALLBACK4-NEXT:    movq %rcx, -{{[0-9]+}}(%rsp)
+; FALLBACK4-NEXT:    leal (,%rdi,8), %eax
+; FALLBACK4-NEXT:    andl $56, %eax
+; FALLBACK4-NEXT:    andl $56, %edi
+; FALLBACK4-NEXT:    movq -128(%rsp,%rdi), %r10
+; FALLBACK4-NEXT:    movq -120(%rsp,%rdi), %r9
+; FALLBACK4-NEXT:    movl %eax, %ecx
+; FALLBACK4-NEXT:    shrq %cl, %r10
+; FALLBACK4-NEXT:    movl %eax, %esi
+; FALLBACK4-NEXT:    notb %sil
+; FALLBACK4-NEXT:    leaq (%r9,%r9), %r8
+; FALLBACK4-NEXT:    movl %esi, %ecx
+; FALLBACK4-NEXT:    shlq %cl, %r8
+; FALLBACK4-NEXT:    orq %r10, %r8
+; FALLBACK4-NEXT:    movq -104(%rsp,%rdi), %r10
+; FALLBACK4-NEXT:    movq %r10, %rbx
+; FALLBACK4-NEXT:    movl %eax, %ecx
+; FALLBACK4-NEXT:    shrq %cl, %rbx
+; FALLBACK4-NEXT:    movq -96(%rsp,%rdi), %r12
+; FALLBACK4-NEXT:    leaq (%r12,%r12), %r11
+; FALLBACK4-NEXT:    movl %esi, %ecx
+; FALLBACK4-NEXT:    shlq %cl, %r11
+; FALLBACK4-NEXT:    orq %rbx, %r11
+; FALLBACK4-NEXT:    movq -112(%rsp,%rdi), %rbx
+; FALLBACK4-NEXT:    movq %rbx, %r14
+; FALLBACK4-NEXT:    movl %eax, %ecx
+; FALLBACK4-NEXT:    shrq %cl, %r14
+; FALLBACK4-NEXT:    addq %r10, %r10
+; FALLBACK4-NEXT:    movl %esi, %ecx
+; FALLBACK4-NEXT:    shlq %cl, %r10
+; FALLBACK4-NEXT:    orq %r14, %r10
+; FALLBACK4-NEXT:    movq -88(%rsp,%rdi), %r14
+; FALLBACK4-NEXT:    movq %r14, %r13
+; FALLBACK4-NEXT:    movl %eax, %ecx
+; FALLBACK4-NEXT:    shrq %cl, %r13
+; FALLBACK4-NEXT:    movq -80(%rsp,%rdi), %rbp
+; FALLBACK4-NEXT:    leaq (%rbp,%rbp), %r15
+; FALLBACK4-NEXT:    movl %esi, %ecx
+; FALLBACK4-NEXT:    shlq %cl, %r15
+; FALLBACK4-NEXT:    orq %r13, %r15
+; FALLBACK4-NEXT:    movl %eax, %ecx
+; FALLBACK4-NEXT:    shrq %cl, %r12
+; FALLBACK4-NEXT:    addq %r14, %r14
+; FALLBACK4-NEXT:    movl %esi, %ecx
+; FALLBACK4-NEXT:    shlq %cl, %r14
+; FALLBACK4-NEXT:    orq %r12, %r14
+; FALLBACK4-NEXT:    movl %eax, %ecx
+; FALLBACK4-NEXT:    shrq %cl, %rbp
+; FALLBACK4-NEXT:    movq -72(%rsp,%rdi), %rdi
+; FALLBACK4-NEXT:    leaq (%rdi,%rdi), %r12
+; FALLBACK4-NEXT:    movl %esi, %ecx
+; FALLBACK4-NEXT:    shlq %cl, %r12
+; FALLBACK4-NEXT:    orq %rbp, %r12
+; FALLBACK4-NEXT:    movl %eax, %ecx
+; FALLBACK4-NEXT:    shrq %cl, %r9
+; FALLBACK4-NEXT:    addq %rbx, %rbx
+; FALLBACK4-NEXT:    movl %esi, %ecx
+; FALLBACK4-NEXT:    shlq %cl, %rbx
+; FALLBACK4-NEXT:    orq %r9, %rbx
+; FALLBACK4-NEXT:    movl %eax, %ecx
+; FALLBACK4-NEXT:    sarq %cl, %rdi
+; FALLBACK4-NEXT:    movq %rdi, 56(%rdx)
+; FALLBACK4-NEXT:    movq %rbx, 8(%rdx)
+; FALLBACK4-NEXT:    movq %r12, 48(%rdx)
+; FALLBACK4-NEXT:    movq %r14, 32(%rdx)
+; FALLBACK4-NEXT:    movq %r15, 40(%rdx)
+; FALLBACK4-NEXT:    movq %r10, 16(%rdx)
+; FALLBACK4-NEXT:    movq %r11, 24(%rdx)
+; FALLBACK4-NEXT:    movq %r8, (%rdx)
+; FALLBACK4-NEXT:    addq $8, %rsp
+; FALLBACK4-NEXT:    popq %rbx
+; FALLBACK4-NEXT:    popq %r12
+; FALLBACK4-NEXT:    popq %r13
+; FALLBACK4-NEXT:    popq %r14
+; FALLBACK4-NEXT:    popq %r15
+; FALLBACK4-NEXT:    popq %rbp
+; FALLBACK4-NEXT:    retq
+;
+; FALLBACK5-LABEL: ashr_64bytes:
+; FALLBACK5:       # %bb.0:
+; FALLBACK5-NEXT:    pushq %r15
+; FALLBACK5-NEXT:    pushq %r14
+; FALLBACK5-NEXT:    pushq %rbx
+; FALLBACK5-NEXT:    movups (%rdi), %xmm0
+; FALLBACK5-NEXT:    movups 16(%rdi), %xmm1
+; FALLBACK5-NEXT:    movups 32(%rdi), %xmm2
+; FALLBACK5-NEXT:    movq 48(%rdi), %rcx
+; FALLBACK5-NEXT:    movq 56(%rdi), %rdi
+; FALLBACK5-NEXT:    movl (%rsi), %eax
+; FALLBACK5-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK5-NEXT:    movq %rcx, -{{[0-9]+}}(%rsp)
+; FALLBACK5-NEXT:    movaps %xmm2, -{{[0-9]+}}(%rsp)
+; FALLBACK5-NEXT:    movaps %xmm1, -{{[0-9]+}}(%rsp)
+; FALLBACK5-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
+; FALLBACK5-NEXT:    sarq $63, %rdi
+; FALLBACK5-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK5-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK5-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK5-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK5-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK5-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK5-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK5-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK5-NEXT:    leal (,%rax,8), %ecx
+; FALLBACK5-NEXT:    andl $56, %ecx
+; FALLBACK5-NEXT:    andl $56, %eax
+; FALLBACK5-NEXT:    movq -96(%rsp,%rax), %rdi
+; FALLBACK5-NEXT:    movq -104(%rsp,%rax), %r9
+; FALLBACK5-NEXT:    movq %r9, %rsi
+; FALLBACK5-NEXT:    shrdq %cl, %rdi, %rsi
+; FALLBACK5-NEXT:    movq -112(%rsp,%rax), %r10
+; FALLBACK5-NEXT:    movq %r10, %r8
+; FALLBACK5-NEXT:    shrdq %cl, %r9, %r8
+; FALLBACK5-NEXT:    movq -80(%rsp,%rax), %r9
+; FALLBACK5-NEXT:    movq -88(%rsp,%rax), %r11
+; FALLBACK5-NEXT:    movq %r11, %rbx
+; FALLBACK5-NEXT:    shrdq %cl, %r9, %rbx
+; FALLBACK5-NEXT:    shrdq %cl, %r11, %rdi
+; FALLBACK5-NEXT:    movq -72(%rsp,%rax), %r11
+; FALLBACK5-NEXT:    shrdq %cl, %r11, %r9
+; FALLBACK5-NEXT:    movq -128(%rsp,%rax), %r14
+; FALLBACK5-NEXT:    movq -120(%rsp,%rax), %rax
+; FALLBACK5-NEXT:    movq %rax, %r15
+; FALLBACK5-NEXT:    shrdq %cl, %r10, %r15
+; FALLBACK5-NEXT:    shrdq %cl, %rax, %r14
+; FALLBACK5-NEXT:    # kill: def $cl killed $cl killed $ecx
+; FALLBACK5-NEXT:    sarq %cl, %r11
+; FALLBACK5-NEXT:    movq %r15, 8(%rdx)
+; FALLBACK5-NEXT:    movq %r9, 48(%rdx)
+; FALLBACK5-NEXT:    movq %r11, 56(%rdx)
+; FALLBACK5-NEXT:    movq %rdi, 32(%rdx)
+; FALLBACK5-NEXT:    movq %rbx, 40(%rdx)
+; FALLBACK5-NEXT:    movq %r8, 16(%rdx)
+; FALLBACK5-NEXT:    movq %rsi, 24(%rdx)
+; FALLBACK5-NEXT:    movq %r14, (%rdx)
+; FALLBACK5-NEXT:    popq %rbx
+; FALLBACK5-NEXT:    popq %r14
+; FALLBACK5-NEXT:    popq %r15
+; FALLBACK5-NEXT:    retq
+;
+; FALLBACK6-LABEL: ashr_64bytes:
+; FALLBACK6:       # %bb.0:
+; FALLBACK6-NEXT:    pushq %rbp
+; FALLBACK6-NEXT:    pushq %r15
+; FALLBACK6-NEXT:    pushq %r14
+; FALLBACK6-NEXT:    pushq %r13
+; FALLBACK6-NEXT:    pushq %r12
+; FALLBACK6-NEXT:    pushq %rbx
+; FALLBACK6-NEXT:    pushq %rax
+; FALLBACK6-NEXT:    movups (%rdi), %xmm0
+; FALLBACK6-NEXT:    movups 16(%rdi), %xmm1
+; FALLBACK6-NEXT:    movups 32(%rdi), %xmm2
+; FALLBACK6-NEXT:    movq 48(%rdi), %rcx
+; FALLBACK6-NEXT:    movq 56(%rdi), %rdi
+; FALLBACK6-NEXT:    movl (%rsi), %eax
+; FALLBACK6-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK6-NEXT:    movq %rcx, -{{[0-9]+}}(%rsp)
+; FALLBACK6-NEXT:    movaps %xmm2, -{{[0-9]+}}(%rsp)
+; FALLBACK6-NEXT:    movaps %xmm1, -{{[0-9]+}}(%rsp)
+; FALLBACK6-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
+; FALLBACK6-NEXT:    sarq $63, %rdi
+; FALLBACK6-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK6-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK6-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK6-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK6-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK6-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK6-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK6-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK6-NEXT:    leal (,%rax,8), %esi
+; FALLBACK6-NEXT:    andl $56, %esi
+; FALLBACK6-NEXT:    andl $56, %eax
+; FALLBACK6-NEXT:    shrxq %rsi, -128(%rsp,%rax), %r11
+; FALLBACK6-NEXT:    movq -112(%rsp,%rax), %rcx
+; FALLBACK6-NEXT:    movq -104(%rsp,%rax), %rdi
+; FALLBACK6-NEXT:    shrxq %rsi, %rdi, %r12
+; FALLBACK6-NEXT:    movq -96(%rsp,%rax), %r13
+; FALLBACK6-NEXT:    shrxq %rsi, %rcx, %r9
+; FALLBACK6-NEXT:    movq -88(%rsp,%rax), %r10
+; FALLBACK6-NEXT:    shrxq %rsi, %r10, %r14
+; FALLBACK6-NEXT:    shrxq %rsi, %r13, %r15
+; FALLBACK6-NEXT:    movl %esi, %ebx
+; FALLBACK6-NEXT:    notb %bl
+; FALLBACK6-NEXT:    movq -120(%rsp,%rax), %rbp
+; FALLBACK6-NEXT:    leaq (%rbp,%rbp), %r8
+; FALLBACK6-NEXT:    shlxq %rbx, %r8, %r8
+; FALLBACK6-NEXT:    orq %r11, %r8
+; FALLBACK6-NEXT:    leaq (%r13,%r13), %r11
+; FALLBACK6-NEXT:    shlxq %rbx, %r11, %r11
+; FALLBACK6-NEXT:    orq %r12, %r11
+; FALLBACK6-NEXT:    movq -80(%rsp,%rax), %r12
+; FALLBACK6-NEXT:    shrxq %rsi, %r12, %r13
+; FALLBACK6-NEXT:    shrxq %rsi, %rbp, %rbp
+; FALLBACK6-NEXT:    movq -72(%rsp,%rax), %rax
+; FALLBACK6-NEXT:    sarxq %rsi, %rax, %rsi
+; FALLBACK6-NEXT:    addq %rdi, %rdi
+; FALLBACK6-NEXT:    shlxq %rbx, %rdi, %rdi
+; FALLBACK6-NEXT:    orq %r9, %rdi
+; FALLBACK6-NEXT:    leaq (%r12,%r12), %r9
+; FALLBACK6-NEXT:    shlxq %rbx, %r9, %r9
+; FALLBACK6-NEXT:    orq %r14, %r9
+; FALLBACK6-NEXT:    addq %r10, %r10
+; FALLBACK6-NEXT:    shlxq %rbx, %r10, %r10
+; FALLBACK6-NEXT:    orq %r15, %r10
+; FALLBACK6-NEXT:    addq %rax, %rax
+; FALLBACK6-NEXT:    shlxq %rbx, %rax, %rax
+; FALLBACK6-NEXT:    orq %r13, %rax
+; FALLBACK6-NEXT:    addq %rcx, %rcx
+; FALLBACK6-NEXT:    shlxq %rbx, %rcx, %rcx
+; FALLBACK6-NEXT:    orq %rbp, %rcx
+; FALLBACK6-NEXT:    movq %rsi, 56(%rdx)
+; FALLBACK6-NEXT:    movq %rcx, 8(%rdx)
+; FALLBACK6-NEXT:    movq %rax, 48(%rdx)
+; FALLBACK6-NEXT:    movq %r10, 32(%rdx)
+; FALLBACK6-NEXT:    movq %r9, 40(%rdx)
+; FALLBACK6-NEXT:    movq %rdi, 16(%rdx)
+; FALLBACK6-NEXT:    movq %r11, 24(%rdx)
+; FALLBACK6-NEXT:    movq %r8, (%rdx)
+; FALLBACK6-NEXT:    addq $8, %rsp
+; FALLBACK6-NEXT:    popq %rbx
+; FALLBACK6-NEXT:    popq %r12
+; FALLBACK6-NEXT:    popq %r13
+; FALLBACK6-NEXT:    popq %r14
+; FALLBACK6-NEXT:    popq %r15
+; FALLBACK6-NEXT:    popq %rbp
+; FALLBACK6-NEXT:    retq
+;
+; FALLBACK7-LABEL: ashr_64bytes:
+; FALLBACK7:       # %bb.0:
+; FALLBACK7-NEXT:    pushq %r15
+; FALLBACK7-NEXT:    pushq %r14
+; FALLBACK7-NEXT:    pushq %rbx
+; FALLBACK7-NEXT:    movups (%rdi), %xmm0
+; FALLBACK7-NEXT:    movups 16(%rdi), %xmm1
+; FALLBACK7-NEXT:    movups 32(%rdi), %xmm2
+; FALLBACK7-NEXT:    movq 48(%rdi), %rcx
+; FALLBACK7-NEXT:    movq 56(%rdi), %rdi
+; FALLBACK7-NEXT:    movl (%rsi), %eax
+; FALLBACK7-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK7-NEXT:    movq %rcx, -{{[0-9]+}}(%rsp)
+; FALLBACK7-NEXT:    movaps %xmm2, -{{[0-9]+}}(%rsp)
+; FALLBACK7-NEXT:    movaps %xmm1, -{{[0-9]+}}(%rsp)
+; FALLBACK7-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
+; FALLBACK7-NEXT:    sarq $63, %rdi
+; FALLBACK7-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK7-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK7-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK7-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK7-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK7-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK7-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK7-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK7-NEXT:    leal (,%rax,8), %ecx
+; FALLBACK7-NEXT:    andl $56, %ecx
+; FALLBACK7-NEXT:    andl $56, %eax
+; FALLBACK7-NEXT:    movq -96(%rsp,%rax), %rdi
+; FALLBACK7-NEXT:    movq -104(%rsp,%rax), %r9
+; FALLBACK7-NEXT:    movq %r9, %rsi
+; FALLBACK7-NEXT:    shrdq %cl, %rdi, %rsi
+; FALLBACK7-NEXT:    movq -112(%rsp,%rax), %r10
+; FALLBACK7-NEXT:    movq %r10, %r8
+; FALLBACK7-NEXT:    shrdq %cl, %r9, %r8
+; FALLBACK7-NEXT:    movq -80(%rsp,%rax), %r9
+; FALLBACK7-NEXT:    movq -88(%rsp,%rax), %r11
+; FALLBACK7-NEXT:    movq %r11, %rbx
+; FALLBACK7-NEXT:    shrdq %cl, %r9, %rbx
+; FALLBACK7-NEXT:    shrdq %cl, %r11, %rdi
+; FALLBACK7-NEXT:    movq -72(%rsp,%rax), %r11
+; FALLBACK7-NEXT:    shrdq %cl, %r11, %r9
+; FALLBACK7-NEXT:    movq -128(%rsp,%rax), %r14
+; FALLBACK7-NEXT:    movq -120(%rsp,%rax), %rax
+; FALLBACK7-NEXT:    movq %rax, %r15
+; FALLBACK7-NEXT:    shrdq %cl, %r10, %r15
+; FALLBACK7-NEXT:    sarxq %rcx, %r11, %r10
+; FALLBACK7-NEXT:    # kill: def $cl killed $cl killed $rcx
+; FALLBACK7-NEXT:    shrdq %cl, %rax, %r14
+; FALLBACK7-NEXT:    movq %r15, 8(%rdx)
+; FALLBACK7-NEXT:    movq %r9, 48(%rdx)
+; FALLBACK7-NEXT:    movq %rdi, 32(%rdx)
+; FALLBACK7-NEXT:    movq %rbx, 40(%rdx)
+; FALLBACK7-NEXT:    movq %r8, 16(%rdx)
+; FALLBACK7-NEXT:    movq %rsi, 24(%rdx)
+; FALLBACK7-NEXT:    movq %r14, (%rdx)
+; FALLBACK7-NEXT:    movq %r10, 56(%rdx)
+; FALLBACK7-NEXT:    popq %rbx
+; FALLBACK7-NEXT:    popq %r14
+; FALLBACK7-NEXT:    popq %r15
+; FALLBACK7-NEXT:    retq
+;
+; FALLBACK8-LABEL: ashr_64bytes:
+; FALLBACK8:       # %bb.0:
+; FALLBACK8-NEXT:    pushq %rbp
+; FALLBACK8-NEXT:    pushq %r15
+; FALLBACK8-NEXT:    pushq %r14
+; FALLBACK8-NEXT:    pushq %r13
+; FALLBACK8-NEXT:    pushq %r12
+; FALLBACK8-NEXT:    pushq %rbx
+; FALLBACK8-NEXT:    pushq %rax
+; FALLBACK8-NEXT:    vmovups (%rdi), %ymm0
+; FALLBACK8-NEXT:    vmovups 32(%rdi), %xmm1
+; FALLBACK8-NEXT:    movq 48(%rdi), %rax
+; FALLBACK8-NEXT:    movq 56(%rdi), %rcx
+; FALLBACK8-NEXT:    movl (%rsi), %edi
+; FALLBACK8-NEXT:    movq %rcx, -{{[0-9]+}}(%rsp)
+; FALLBACK8-NEXT:    movq %rax, -{{[0-9]+}}(%rsp)
+; FALLBACK8-NEXT:    vmovaps %xmm1, -{{[0-9]+}}(%rsp)
+; FALLBACK8-NEXT:    vmovups %ymm0, -{{[0-9]+}}(%rsp)
+; FALLBACK8-NEXT:    sarq $63, %rcx
+; FALLBACK8-NEXT:    movq %rcx, -{{[0-9]+}}(%rsp)
+; FALLBACK8-NEXT:    movq %rcx, -{{[0-9]+}}(%rsp)
+; FALLBACK8-NEXT:    movq %rcx, -{{[0-9]+}}(%rsp)
+; FALLBACK8-NEXT:    movq %rcx, -{{[0-9]+}}(%rsp)
+; FALLBACK8-NEXT:    movq %rcx, -{{[0-9]+}}(%rsp)
+; FALLBACK8-NEXT:    movq %rcx, -{{[0-9]+}}(%rsp)
+; FALLBACK8-NEXT:    movq %rcx, -{{[0-9]+}}(%rsp)
+; FALLBACK8-NEXT:    movq %rcx, -{{[0-9]+}}(%rsp)
+; FALLBACK8-NEXT:    leal (,%rdi,8), %eax
+; FALLBACK8-NEXT:    andl $56, %eax
+; FALLBACK8-NEXT:    andl $56, %edi
+; FALLBACK8-NEXT:    movq -128(%rsp,%rdi), %r10
+; FALLBACK8-NEXT:    movq -120(%rsp,%rdi), %r9
+; FALLBACK8-NEXT:    movl %eax, %ecx
+; FALLBACK8-NEXT:    shrq %cl, %r10
+; FALLBACK8-NEXT:    movl %eax, %esi
+; FALLBACK8-NEXT:    notb %sil
+; FALLBACK8-NEXT:    leaq (%r9,%r9), %r8
+; FALLBACK8-NEXT:    movl %esi, %ecx
+; FALLBACK8-NEXT:    shlq %cl, %r8
+; FALLBACK8-NEXT:    orq %r10, %r8
+; FALLBACK8-NEXT:    movq -104(%rsp,%rdi), %r10
+; FALLBACK8-NEXT:    movq %r10, %rbx
+; FALLBACK8-NEXT:    movl %eax, %ecx
+; FALLBACK8-NEXT:    shrq %cl, %rbx
+; FALLBACK8-NEXT:    movq -96(%rsp,%rdi), %r12
+; FALLBACK8-NEXT:    leaq (%r12,%r12), %r11
+; FALLBACK8-NEXT:    movl %esi, %ecx
+; FALLBACK8-NEXT:    shlq %cl, %r11
+; FALLBACK8-NEXT:    orq %rbx, %r11
+; FALLBACK8-NEXT:    movq -112(%rsp,%rdi), %rbx
+; FALLBACK8-NEXT:    movq %rbx, %r14
+; FALLBACK8-NEXT:    movl %eax, %ecx
+; FALLBACK8-NEXT:    shrq %cl, %r14
+; FALLBACK8-NEXT:    addq %r10, %r10
+; FALLBACK8-NEXT:    movl %esi, %ecx
+; FALLBACK8-NEXT:    shlq %cl, %r10
+; FALLBACK8-NEXT:    orq %r14, %r10
+; FALLBACK8-NEXT:    movq -88(%rsp,%rdi), %r14
+; FALLBACK8-NEXT:    movq %r14, %r13
+; FALLBACK8-NEXT:    movl %eax, %ecx
+; FALLBACK8-NEXT:    shrq %cl, %r13
+; FALLBACK8-NEXT:    movq -80(%rsp,%rdi), %rbp
+; FALLBACK8-NEXT:    leaq (%rbp,%rbp), %r15
+; FALLBACK8-NEXT:    movl %esi, %ecx
+; FALLBACK8-NEXT:    shlq %cl, %r15
+; FALLBACK8-NEXT:    orq %r13, %r15
+; FALLBACK8-NEXT:    movl %eax, %ecx
+; FALLBACK8-NEXT:    shrq %cl, %r12
+; FALLBACK8-NEXT:    addq %r14, %r14
+; FALLBACK8-NEXT:    movl %esi, %ecx
+; FALLBACK8-NEXT:    shlq %cl, %r14
+; FALLBACK8-NEXT:    orq %r12, %r14
+; FALLBACK8-NEXT:    movl %eax, %ecx
+; FALLBACK8-NEXT:    shrq %cl, %rbp
+; FALLBACK8-NEXT:    movq -72(%rsp,%rdi), %rdi
+; FALLBACK8-NEXT:    leaq (%rdi,%rdi), %r12
+; FALLBACK8-NEXT:    movl %esi, %ecx
+; FALLBACK8-NEXT:    shlq %cl, %r12
+; FALLBACK8-NEXT:    orq %rbp, %r12
+; FALLBACK8-NEXT:    movl %eax, %ecx
+; FALLBACK8-NEXT:    shrq %cl, %r9
+; FALLBACK8-NEXT:    addq %rbx, %rbx
+; FALLBACK8-NEXT:    movl %esi, %ecx
+; FALLBACK8-NEXT:    shlq %cl, %rbx
+; FALLBACK8-NEXT:    orq %r9, %rbx
+; FALLBACK8-NEXT:    movl %eax, %ecx
+; FALLBACK8-NEXT:    sarq %cl, %rdi
+; FALLBACK8-NEXT:    movq %rdi, 56(%rdx)
+; FALLBACK8-NEXT:    movq %rbx, 8(%rdx)
+; FALLBACK8-NEXT:    movq %r12, 48(%rdx)
+; FALLBACK8-NEXT:    movq %r14, 32(%rdx)
+; FALLBACK8-NEXT:    movq %r15, 40(%rdx)
+; FALLBACK8-NEXT:    movq %r10, 16(%rdx)
+; FALLBACK8-NEXT:    movq %r11, 24(%rdx)
+; FALLBACK8-NEXT:    movq %r8, (%rdx)
+; FALLBACK8-NEXT:    addq $8, %rsp
+; FALLBACK8-NEXT:    popq %rbx
+; FALLBACK8-NEXT:    popq %r12
+; FALLBACK8-NEXT:    popq %r13
+; FALLBACK8-NEXT:    popq %r14
+; FALLBACK8-NEXT:    popq %r15
+; FALLBACK8-NEXT:    popq %rbp
+; FALLBACK8-NEXT:    vzeroupper
+; FALLBACK8-NEXT:    retq
+;
+; FALLBACK9-LABEL: ashr_64bytes:
+; FALLBACK9:       # %bb.0:
+; FALLBACK9-NEXT:    pushq %r15
+; FALLBACK9-NEXT:    pushq %r14
+; FALLBACK9-NEXT:    pushq %rbx
+; FALLBACK9-NEXT:    vmovups (%rdi), %ymm0
+; FALLBACK9-NEXT:    vmovups 32(%rdi), %xmm1
+; FALLBACK9-NEXT:    movq 48(%rdi), %rcx
+; FALLBACK9-NEXT:    movq 56(%rdi), %rdi
+; FALLBACK9-NEXT:    movl (%rsi), %eax
+; FALLBACK9-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK9-NEXT:    movq %rcx, -{{[0-9]+}}(%rsp)
+; FALLBACK9-NEXT:    vmovaps %xmm1, -{{[0-9]+}}(%rsp)
+; FALLBACK9-NEXT:    vmovups %ymm0, -{{[0-9]+}}(%rsp)
+; FALLBACK9-NEXT:    sarq $63, %rdi
+; FALLBACK9-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK9-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK9-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK9-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK9-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK9-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK9-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK9-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK9-NEXT:    leal (,%rax,8), %ecx
+; FALLBACK9-NEXT:    andl $56, %ecx
+; FALLBACK9-NEXT:    andl $56, %eax
+; FALLBACK9-NEXT:    movq -96(%rsp,%rax), %rdi
+; FALLBACK9-NEXT:    movq -104(%rsp,%rax), %r9
+; FALLBACK9-NEXT:    movq %r9, %rsi
+; FALLBACK9-NEXT:    shrdq %cl, %rdi, %rsi
+; FALLBACK9-NEXT:    movq -112(%rsp,%rax), %r10
+; FALLBACK9-NEXT:    movq %r10, %r8
+; FALLBACK9-NEXT:    shrdq %cl, %r9, %r8
+; FALLBACK9-NEXT:    movq -80(%rsp,%rax), %r9
+; FALLBACK9-NEXT:    movq -88(%rsp,%rax), %r11
+; FALLBACK9-NEXT:    movq %r11, %rbx
+; FALLBACK9-NEXT:    shrdq %cl, %r9, %rbx
+; FALLBACK9-NEXT:    shrdq %cl, %r11, %rdi
+; FALLBACK9-NEXT:    movq -72(%rsp,%rax), %r11
+; FALLBACK9-NEXT:    shrdq %cl, %r11, %r9
+; FALLBACK9-NEXT:    movq -128(%rsp,%rax), %r14
+; FALLBACK9-NEXT:    movq -120(%rsp,%rax), %rax
+; FALLBACK9-NEXT:    movq %rax, %r15
+; FALLBACK9-NEXT:    shrdq %cl, %r10, %r15
+; FALLBACK9-NEXT:    shrdq %cl, %rax, %r14
+; FALLBACK9-NEXT:    # kill: def $cl killed $cl killed $ecx
+; FALLBACK9-NEXT:    sarq %cl, %r11
+; FALLBACK9-NEXT:    movq %r15, 8(%rdx)
+; FALLBACK9-NEXT:    movq %r9, 48(%rdx)
+; FALLBACK9-NEXT:    movq %r11, 56(%rdx)
+; FALLBACK9-NEXT:    movq %rdi, 32(%rdx)
+; FALLBACK9-NEXT:    movq %rbx, 40(%rdx)
+; FALLBACK9-NEXT:    movq %r8, 16(%rdx)
+; FALLBACK9-NEXT:    movq %rsi, 24(%rdx)
+; FALLBACK9-NEXT:    movq %r14, (%rdx)
+; FALLBACK9-NEXT:    popq %rbx
+; FALLBACK9-NEXT:    popq %r14
+; FALLBACK9-NEXT:    popq %r15
+; FALLBACK9-NEXT:    vzeroupper
+; FALLBACK9-NEXT:    retq
+;
+; FALLBACK10-LABEL: ashr_64bytes:
+; FALLBACK10:       # %bb.0:
+; FALLBACK10-NEXT:    pushq %rbp
+; FALLBACK10-NEXT:    pushq %r15
+; FALLBACK10-NEXT:    pushq %r14
+; FALLBACK10-NEXT:    pushq %r13
+; FALLBACK10-NEXT:    pushq %r12
+; FALLBACK10-NEXT:    pushq %rbx
+; FALLBACK10-NEXT:    pushq %rax
+; FALLBACK10-NEXT:    vmovups (%rdi), %ymm0
+; FALLBACK10-NEXT:    vmovups 32(%rdi), %xmm1
+; FALLBACK10-NEXT:    movq 48(%rdi), %rcx
+; FALLBACK10-NEXT:    movq 56(%rdi), %rdi
+; FALLBACK10-NEXT:    movl (%rsi), %eax
+; FALLBACK10-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK10-NEXT:    movq %rcx, -{{[0-9]+}}(%rsp)
+; FALLBACK10-NEXT:    vmovaps %xmm1, -{{[0-9]+}}(%rsp)
+; FALLBACK10-NEXT:    vmovups %ymm0, -{{[0-9]+}}(%rsp)
+; FALLBACK10-NEXT:    sarq $63, %rdi
+; FALLBACK10-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK10-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK10-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK10-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK10-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK10-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK10-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK10-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK10-NEXT:    leal (,%rax,8), %esi
+; FALLBACK10-NEXT:    andl $56, %esi
+; FALLBACK10-NEXT:    andl $56, %eax
+; FALLBACK10-NEXT:    shrxq %rsi, -128(%rsp,%rax), %r11
+; FALLBACK10-NEXT:    movq -112(%rsp,%rax), %rcx
+; FALLBACK10-NEXT:    movq -104(%rsp,%rax), %rdi
+; FALLBACK10-NEXT:    shrxq %rsi, %rdi, %r12
+; FALLBACK10-NEXT:    movq -96(%rsp,%rax), %r13
+; FALLBACK10-NEXT:    shrxq %rsi, %rcx, %r9
+; FALLBACK10-NEXT:    movq -88(%rsp,%rax), %r10
+; FALLBACK10-NEXT:    shrxq %rsi, %r10, %r14
+; FALLBACK10-NEXT:    shrxq %rsi, %r13, %r15
+; FALLBACK10-NEXT:    movl %esi, %ebx
+; FALLBACK10-NEXT:    notb %bl
+; FALLBACK10-NEXT:    movq -120(%rsp,%rax), %rbp
+; FALLBACK10-NEXT:    leaq (%rbp,%rbp), %r8
+; FALLBACK10-NEXT:    shlxq %rbx, %r8, %r8
+; FALLBACK10-NEXT:    orq %r11, %r8
+; FALLBACK10-NEXT:    leaq (%r13,%r13), %r11
+; FALLBACK10-NEXT:    shlxq %rbx, %r11, %r11
+; FALLBACK10-NEXT:    orq %r12, %r11
+; FALLBACK10-NEXT:    movq -80(%rsp,%rax), %r12
+; FALLBACK10-NEXT:    shrxq %rsi, %r12, %r13
+; FALLBACK10-NEXT:    shrxq %rsi, %rbp, %rbp
+; FALLBACK10-NEXT:    movq -72(%rsp,%rax), %rax
+; FALLBACK10-NEXT:    sarxq %rsi, %rax, %rsi
+; FALLBACK10-NEXT:    addq %rdi, %rdi
+; FALLBACK10-NEXT:    shlxq %rbx, %rdi, %rdi
+; FALLBACK10-NEXT:    orq %r9, %rdi
+; FALLBACK10-NEXT:    leaq (%r12,%r12), %r9
+; FALLBACK10-NEXT:    shlxq %rbx, %r9, %r9
+; FALLBACK10-NEXT:    orq %r14, %r9
+; FALLBACK10-NEXT:    addq %r10, %r10
+; FALLBACK10-NEXT:    shlxq %rbx, %r10, %r10
+; FALLBACK10-NEXT:    orq %r15, %r10
+; FALLBACK10-NEXT:    addq %rax, %rax
+; FALLBACK10-NEXT:    shlxq %rbx, %rax, %rax
+; FALLBACK10-NEXT:    orq %r13, %rax
+; FALLBACK10-NEXT:    addq %rcx, %rcx
+; FALLBACK10-NEXT:    shlxq %rbx, %rcx, %rcx
+; FALLBACK10-NEXT:    orq %rbp, %rcx
+; FALLBACK10-NEXT:    movq %rsi, 56(%rdx)
+; FALLBACK10-NEXT:    movq %rcx, 8(%rdx)
+; FALLBACK10-NEXT:    movq %rax, 48(%rdx)
+; FALLBACK10-NEXT:    movq %r10, 32(%rdx)
+; FALLBACK10-NEXT:    movq %r9, 40(%rdx)
+; FALLBACK10-NEXT:    movq %rdi, 16(%rdx)
+; FALLBACK10-NEXT:    movq %r11, 24(%rdx)
+; FALLBACK10-NEXT:    movq %r8, (%rdx)
+; FALLBACK10-NEXT:    addq $8, %rsp
+; FALLBACK10-NEXT:    popq %rbx
+; FALLBACK10-NEXT:    popq %r12
+; FALLBACK10-NEXT:    popq %r13
+; FALLBACK10-NEXT:    popq %r14
+; FALLBACK10-NEXT:    popq %r15
+; FALLBACK10-NEXT:    popq %rbp
+; FALLBACK10-NEXT:    vzeroupper
+; FALLBACK10-NEXT:    retq
+;
+; FALLBACK11-LABEL: ashr_64bytes:
+; FALLBACK11:       # %bb.0:
+; FALLBACK11-NEXT:    pushq %r15
+; FALLBACK11-NEXT:    pushq %r14
+; FALLBACK11-NEXT:    pushq %rbx
+; FALLBACK11-NEXT:    vmovups (%rdi), %ymm0
+; FALLBACK11-NEXT:    vmovups 32(%rdi), %xmm1
+; FALLBACK11-NEXT:    movq 48(%rdi), %rcx
+; FALLBACK11-NEXT:    movq 56(%rdi), %rdi
+; FALLBACK11-NEXT:    movl (%rsi), %eax
+; FALLBACK11-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK11-NEXT:    movq %rcx, -{{[0-9]+}}(%rsp)
+; FALLBACK11-NEXT:    vmovaps %xmm1, -{{[0-9]+}}(%rsp)
+; FALLBACK11-NEXT:    vmovups %ymm0, -{{[0-9]+}}(%rsp)
+; FALLBACK11-NEXT:    sarq $63, %rdi
+; FALLBACK11-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK11-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK11-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK11-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK11-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK11-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK11-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK11-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK11-NEXT:    leal (,%rax,8), %ecx
+; FALLBACK11-NEXT:    andl $56, %ecx
+; FALLBACK11-NEXT:    andl $56, %eax
+; FALLBACK11-NEXT:    movq -96(%rsp,%rax), %rdi
+; FALLBACK11-NEXT:    movq -104(%rsp,%rax), %r9
+; FALLBACK11-NEXT:    movq %r9, %rsi
+; FALLBACK11-NEXT:    shrdq %cl, %rdi, %rsi
+; FALLBACK11-NEXT:    movq -112(%rsp,%rax), %r10
+; FALLBACK11-NEXT:    movq %r10, %r8
+; FALLBACK11-NEXT:    shrdq %cl, %r9, %r8
+; FALLBACK11-NEXT:    movq -80(%rsp,%rax), %r9
+; FALLBACK11-NEXT:    movq -88(%rsp,%rax), %r11
+; FALLBACK11-NEXT:    movq %r11, %rbx
+; FALLBACK11-NEXT:    shrdq %cl, %r9, %rbx
+; FALLBACK11-NEXT:    shrdq %cl, %r11, %rdi
+; FALLBACK11-NEXT:    movq -72(%rsp,%rax), %r11
+; FALLBACK11-NEXT:    shrdq %cl, %r11, %r9
+; FALLBACK11-NEXT:    movq -128(%rsp,%rax), %r14
+; FALLBACK11-NEXT:    movq -120(%rsp,%rax), %rax
+; FALLBACK11-NEXT:    movq %rax, %r15
+; FALLBACK11-NEXT:    shrdq %cl, %r10, %r15
+; FALLBACK11-NEXT:    sarxq %rcx, %r11, %r10
+; FALLBACK11-NEXT:    # kill: def $cl killed $cl killed $rcx
+; FALLBACK11-NEXT:    shrdq %cl, %rax, %r14
+; FALLBACK11-NEXT:    movq %r15, 8(%rdx)
+; FALLBACK11-NEXT:    movq %r9, 48(%rdx)
+; FALLBACK11-NEXT:    movq %rdi, 32(%rdx)
+; FALLBACK11-NEXT:    movq %rbx, 40(%rdx)
+; FALLBACK11-NEXT:    movq %r8, 16(%rdx)
+; FALLBACK11-NEXT:    movq %rsi, 24(%rdx)
+; FALLBACK11-NEXT:    movq %r14, (%rdx)
+; FALLBACK11-NEXT:    movq %r10, 56(%rdx)
+; FALLBACK11-NEXT:    popq %rbx
+; FALLBACK11-NEXT:    popq %r14
+; FALLBACK11-NEXT:    popq %r15
+; FALLBACK11-NEXT:    vzeroupper
+; FALLBACK11-NEXT:    retq
+;
+; FALLBACK12-LABEL: ashr_64bytes:
+; FALLBACK12:       # %bb.0:
+; FALLBACK12-NEXT:    pushq %rbp
+; FALLBACK12-NEXT:    pushq %r15
+; FALLBACK12-NEXT:    pushq %r14
+; FALLBACK12-NEXT:    pushq %r13
+; FALLBACK12-NEXT:    pushq %r12
+; FALLBACK12-NEXT:    pushq %rbx
+; FALLBACK12-NEXT:    pushq %rax
+; FALLBACK12-NEXT:    vmovups (%rdi), %ymm0
+; FALLBACK12-NEXT:    vmovups 32(%rdi), %xmm1
+; FALLBACK12-NEXT:    movq 48(%rdi), %rax
+; FALLBACK12-NEXT:    movq 56(%rdi), %rcx
+; FALLBACK12-NEXT:    movl (%rsi), %edi
+; FALLBACK12-NEXT:    movq %rcx, -{{[0-9]+}}(%rsp)
+; FALLBACK12-NEXT:    movq %rax, -{{[0-9]+}}(%rsp)
+; FALLBACK12-NEXT:    vmovaps %xmm1, -{{[0-9]+}}(%rsp)
+; FALLBACK12-NEXT:    vmovups %ymm0, -{{[0-9]+}}(%rsp)
+; FALLBACK12-NEXT:    sarq $63, %rcx
+; FALLBACK12-NEXT:    movq %rcx, -{{[0-9]+}}(%rsp)
+; FALLBACK12-NEXT:    movq %rcx, -{{[0-9]+}}(%rsp)
+; FALLBACK12-NEXT:    movq %rcx, -{{[0-9]+}}(%rsp)
+; FALLBACK12-NEXT:    movq %rcx, -{{[0-9]+}}(%rsp)
+; FALLBACK12-NEXT:    movq %rcx, -{{[0-9]+}}(%rsp)
+; FALLBACK12-NEXT:    movq %rcx, -{{[0-9]+}}(%rsp)
+; FALLBACK12-NEXT:    movq %rcx, -{{[0-9]+}}(%rsp)
+; FALLBACK12-NEXT:    movq %rcx, -{{[0-9]+}}(%rsp)
+; FALLBACK12-NEXT:    leal (,%rdi,8), %eax
+; FALLBACK12-NEXT:    andl $56, %eax
+; FALLBACK12-NEXT:    andl $56, %edi
+; FALLBACK12-NEXT:    movq -128(%rsp,%rdi), %r10
+; FALLBACK12-NEXT:    movq -120(%rsp,%rdi), %r9
+; FALLBACK12-NEXT:    movl %eax, %ecx
+; FALLBACK12-NEXT:    shrq %cl, %r10
+; FALLBACK12-NEXT:    movl %eax, %esi
+; FALLBACK12-NEXT:    notb %sil
+; FALLBACK12-NEXT:    leaq (%r9,%r9), %r8
+; FALLBACK12-NEXT:    movl %esi, %ecx
+; FALLBACK12-NEXT:    shlq %cl, %r8
+; FALLBACK12-NEXT:    orq %r10, %r8
+; FALLBACK12-NEXT:    movq -104(%rsp,%rdi), %r10
+; FALLBACK12-NEXT:    movq %r10, %rbx
+; FALLBACK12-NEXT:    movl %eax, %ecx
+; FALLBACK12-NEXT:    shrq %cl, %rbx
+; FALLBACK12-NEXT:    movq -96(%rsp,%rdi), %r12
+; FALLBACK12-NEXT:    leaq (%r12,%r12), %r11
+; FALLBACK12-NEXT:    movl %esi, %ecx
+; FALLBACK12-NEXT:    shlq %cl, %r11
+; FALLBACK12-NEXT:    orq %rbx, %r11
+; FALLBACK12-NEXT:    movq -112(%rsp,%rdi), %rbx
+; FALLBACK12-NEXT:    movq %rbx, %r14
+; FALLBACK12-NEXT:    movl %eax, %ecx
+; FALLBACK12-NEXT:    shrq %cl, %r14
+; FALLBACK12-NEXT:    addq %r10, %r10
+; FALLBACK12-NEXT:    movl %esi, %ecx
+; FALLBACK12-NEXT:    shlq %cl, %r10
+; FALLBACK12-NEXT:    orq %r14, %r10
+; FALLBACK12-NEXT:    movq -88(%rsp,%rdi), %r14
+; FALLBACK12-NEXT:    movq %r14, %r13
+; FALLBACK12-NEXT:    movl %eax, %ecx
+; FALLBACK12-NEXT:    shrq %cl, %r13
+; FALLBACK12-NEXT:    movq -80(%rsp,%rdi), %rbp
+; FALLBACK12-NEXT:    leaq (%rbp,%rbp), %r15
+; FALLBACK12-NEXT:    movl %esi, %ecx
+; FALLBACK12-NEXT:    shlq %cl, %r15
+; FALLBACK12-NEXT:    orq %r13, %r15
+; FALLBACK12-NEXT:    movl %eax, %ecx
+; FALLBACK12-NEXT:    shrq %cl, %r12
+; FALLBACK12-NEXT:    addq %r14, %r14
+; FALLBACK12-NEXT:    movl %esi, %ecx
+; FALLBACK12-NEXT:    shlq %cl, %r14
+; FALLBACK12-NEXT:    orq %r12, %r14
+; FALLBACK12-NEXT:    movl %eax, %ecx
+; FALLBACK12-NEXT:    shrq %cl, %rbp
+; FALLBACK12-NEXT:    movq -72(%rsp,%rdi), %rdi
+; FALLBACK12-NEXT:    leaq (%rdi,%rdi), %r12
+; FALLBACK12-NEXT:    movl %esi, %ecx
+; FALLBACK12-NEXT:    shlq %cl, %r12
+; FALLBACK12-NEXT:    orq %rbp, %r12
+; FALLBACK12-NEXT:    movl %eax, %ecx
+; FALLBACK12-NEXT:    shrq %cl, %r9
+; FALLBACK12-NEXT:    addq %rbx, %rbx
+; FALLBACK12-NEXT:    movl %esi, %ecx
+; FALLBACK12-NEXT:    shlq %cl, %rbx
+; FALLBACK12-NEXT:    orq %r9, %rbx
+; FALLBACK12-NEXT:    movl %eax, %ecx
+; FALLBACK12-NEXT:    sarq %cl, %rdi
+; FALLBACK12-NEXT:    movq %rdi, 56(%rdx)
+; FALLBACK12-NEXT:    movq %rbx, 8(%rdx)
+; FALLBACK12-NEXT:    movq %r12, 48(%rdx)
+; FALLBACK12-NEXT:    movq %r14, 32(%rdx)
+; FALLBACK12-NEXT:    movq %r15, 40(%rdx)
+; FALLBACK12-NEXT:    movq %r10, 16(%rdx)
+; FALLBACK12-NEXT:    movq %r11, 24(%rdx)
+; FALLBACK12-NEXT:    movq %r8, (%rdx)
+; FALLBACK12-NEXT:    addq $8, %rsp
+; FALLBACK12-NEXT:    popq %rbx
+; FALLBACK12-NEXT:    popq %r12
+; FALLBACK12-NEXT:    popq %r13
+; FALLBACK12-NEXT:    popq %r14
+; FALLBACK12-NEXT:    popq %r15
+; FALLBACK12-NEXT:    popq %rbp
+; FALLBACK12-NEXT:    vzeroupper
+; FALLBACK12-NEXT:    retq
+;
+; FALLBACK13-LABEL: ashr_64bytes:
+; FALLBACK13:       # %bb.0:
+; FALLBACK13-NEXT:    pushq %r15
+; FALLBACK13-NEXT:    pushq %r14
+; FALLBACK13-NEXT:    pushq %rbx
+; FALLBACK13-NEXT:    vmovups (%rdi), %ymm0
+; FALLBACK13-NEXT:    vmovups 32(%rdi), %xmm1
+; FALLBACK13-NEXT:    movq 48(%rdi), %rcx
+; FALLBACK13-NEXT:    movq 56(%rdi), %rdi
+; FALLBACK13-NEXT:    movl (%rsi), %eax
+; FALLBACK13-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK13-NEXT:    movq %rcx, -{{[0-9]+}}(%rsp)
+; FALLBACK13-NEXT:    vmovaps %xmm1, -{{[0-9]+}}(%rsp)
+; FALLBACK13-NEXT:    vmovups %ymm0, -{{[0-9]+}}(%rsp)
+; FALLBACK13-NEXT:    sarq $63, %rdi
+; FALLBACK13-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK13-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK13-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK13-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK13-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK13-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK13-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK13-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK13-NEXT:    leal (,%rax,8), %ecx
+; FALLBACK13-NEXT:    andl $56, %ecx
+; FALLBACK13-NEXT:    andl $56, %eax
+; FALLBACK13-NEXT:    movq -96(%rsp,%rax), %rdi
+; FALLBACK13-NEXT:    movq -104(%rsp,%rax), %r9
+; FALLBACK13-NEXT:    movq %r9, %rsi
+; FALLBACK13-NEXT:    shrdq %cl, %rdi, %rsi
+; FALLBACK13-NEXT:    movq -112(%rsp,%rax), %r10
+; FALLBACK13-NEXT:    movq %r10, %r8
+; FALLBACK13-NEXT:    shrdq %cl, %r9, %r8
+; FALLBACK13-NEXT:    movq -80(%rsp,%rax), %r9
+; FALLBACK13-NEXT:    movq -88(%rsp,%rax), %r11
+; FALLBACK13-NEXT:    movq %r11, %rbx
+; FALLBACK13-NEXT:    shrdq %cl, %r9, %rbx
+; FALLBACK13-NEXT:    shrdq %cl, %r11, %rdi
+; FALLBACK13-NEXT:    movq -72(%rsp,%rax), %r11
+; FALLBACK13-NEXT:    shrdq %cl, %r11, %r9
+; FALLBACK13-NEXT:    movq -128(%rsp,%rax), %r14
+; FALLBACK13-NEXT:    movq -120(%rsp,%rax), %rax
+; FALLBACK13-NEXT:    movq %rax, %r15
+; FALLBACK13-NEXT:    shrdq %cl, %r10, %r15
+; FALLBACK13-NEXT:    shrdq %cl, %rax, %r14
+; FALLBACK13-NEXT:    # kill: def $cl killed $cl killed $ecx
+; FALLBACK13-NEXT:    sarq %cl, %r11
+; FALLBACK13-NEXT:    movq %r15, 8(%rdx)
+; FALLBACK13-NEXT:    movq %r9, 48(%rdx)
+; FALLBACK13-NEXT:    movq %r11, 56(%rdx)
+; FALLBACK13-NEXT:    movq %rdi, 32(%rdx)
+; FALLBACK13-NEXT:    movq %rbx, 40(%rdx)
+; FALLBACK13-NEXT:    movq %r8, 16(%rdx)
+; FALLBACK13-NEXT:    movq %rsi, 24(%rdx)
+; FALLBACK13-NEXT:    movq %r14, (%rdx)
+; FALLBACK13-NEXT:    popq %rbx
+; FALLBACK13-NEXT:    popq %r14
+; FALLBACK13-NEXT:    popq %r15
+; FALLBACK13-NEXT:    vzeroupper
+; FALLBACK13-NEXT:    retq
+;
+; FALLBACK14-LABEL: ashr_64bytes:
+; FALLBACK14:       # %bb.0:
+; FALLBACK14-NEXT:    pushq %rbp
+; FALLBACK14-NEXT:    pushq %r15
+; FALLBACK14-NEXT:    pushq %r14
+; FALLBACK14-NEXT:    pushq %r13
+; FALLBACK14-NEXT:    pushq %r12
+; FALLBACK14-NEXT:    pushq %rbx
+; FALLBACK14-NEXT:    pushq %rax
+; FALLBACK14-NEXT:    vmovups (%rdi), %ymm0
+; FALLBACK14-NEXT:    vmovups 32(%rdi), %xmm1
+; FALLBACK14-NEXT:    movq 48(%rdi), %rcx
+; FALLBACK14-NEXT:    movq 56(%rdi), %rdi
+; FALLBACK14-NEXT:    movl (%rsi), %eax
+; FALLBACK14-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK14-NEXT:    movq %rcx, -{{[0-9]+}}(%rsp)
+; FALLBACK14-NEXT:    vmovaps %xmm1, -{{[0-9]+}}(%rsp)
+; FALLBACK14-NEXT:    vmovups %ymm0, -{{[0-9]+}}(%rsp)
+; FALLBACK14-NEXT:    sarq $63, %rdi
+; FALLBACK14-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK14-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK14-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK14-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK14-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK14-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK14-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK14-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK14-NEXT:    leal (,%rax,8), %esi
+; FALLBACK14-NEXT:    andl $56, %esi
+; FALLBACK14-NEXT:    andl $56, %eax
+; FALLBACK14-NEXT:    shrxq %rsi, -128(%rsp,%rax), %r11
+; FALLBACK14-NEXT:    movq -112(%rsp,%rax), %rcx
+; FALLBACK14-NEXT:    movq -104(%rsp,%rax), %rdi
+; FALLBACK14-NEXT:    shrxq %rsi, %rdi, %r12
+; FALLBACK14-NEXT:    movq -96(%rsp,%rax), %r13
+; FALLBACK14-NEXT:    shrxq %rsi, %rcx, %r9
+; FALLBACK14-NEXT:    movq -88(%rsp,%rax), %r10
+; FALLBACK14-NEXT:    shrxq %rsi, %r10, %r14
+; FALLBACK14-NEXT:    shrxq %rsi, %r13, %r15
+; FALLBACK14-NEXT:    movl %esi, %ebx
+; FALLBACK14-NEXT:    notb %bl
+; FALLBACK14-NEXT:    movq -120(%rsp,%rax), %rbp
+; FALLBACK14-NEXT:    leaq (%rbp,%rbp), %r8
+; FALLBACK14-NEXT:    shlxq %rbx, %r8, %r8
+; FALLBACK14-NEXT:    orq %r11, %r8
+; FALLBACK14-NEXT:    leaq (%r13,%r13), %r11
+; FALLBACK14-NEXT:    shlxq %rbx, %r11, %r11
+; FALLBACK14-NEXT:    orq %r12, %r11
+; FALLBACK14-NEXT:    movq -80(%rsp,%rax), %r12
+; FALLBACK14-NEXT:    shrxq %rsi, %r12, %r13
+; FALLBACK14-NEXT:    shrxq %rsi, %rbp, %rbp
+; FALLBACK14-NEXT:    movq -72(%rsp,%rax), %rax
+; FALLBACK14-NEXT:    sarxq %rsi, %rax, %rsi
+; FALLBACK14-NEXT:    addq %rdi, %rdi
+; FALLBACK14-NEXT:    shlxq %rbx, %rdi, %rdi
+; FALLBACK14-NEXT:    orq %r9, %rdi
+; FALLBACK14-NEXT:    leaq (%r12,%r12), %r9
+; FALLBACK14-NEXT:    shlxq %rbx, %r9, %r9
+; FALLBACK14-NEXT:    orq %r14, %r9
+; FALLBACK14-NEXT:    addq %r10, %r10
+; FALLBACK14-NEXT:    shlxq %rbx, %r10, %r10
+; FALLBACK14-NEXT:    orq %r15, %r10
+; FALLBACK14-NEXT:    addq %rax, %rax
+; FALLBACK14-NEXT:    shlxq %rbx, %rax, %rax
+; FALLBACK14-NEXT:    orq %r13, %rax
+; FALLBACK14-NEXT:    addq %rcx, %rcx
+; FALLBACK14-NEXT:    shlxq %rbx, %rcx, %rcx
+; FALLBACK14-NEXT:    orq %rbp, %rcx
+; FALLBACK14-NEXT:    movq %rsi, 56(%rdx)
+; FALLBACK14-NEXT:    movq %rcx, 8(%rdx)
+; FALLBACK14-NEXT:    movq %rax, 48(%rdx)
+; FALLBACK14-NEXT:    movq %r10, 32(%rdx)
+; FALLBACK14-NEXT:    movq %r9, 40(%rdx)
+; FALLBACK14-NEXT:    movq %rdi, 16(%rdx)
+; FALLBACK14-NEXT:    movq %r11, 24(%rdx)
+; FALLBACK14-NEXT:    movq %r8, (%rdx)
+; FALLBACK14-NEXT:    addq $8, %rsp
+; FALLBACK14-NEXT:    popq %rbx
+; FALLBACK14-NEXT:    popq %r12
+; FALLBACK14-NEXT:    popq %r13
+; FALLBACK14-NEXT:    popq %r14
+; FALLBACK14-NEXT:    popq %r15
+; FALLBACK14-NEXT:    popq %rbp
+; FALLBACK14-NEXT:    vzeroupper
+; FALLBACK14-NEXT:    retq
+;
+; FALLBACK15-LABEL: ashr_64bytes:
+; FALLBACK15:       # %bb.0:
+; FALLBACK15-NEXT:    pushq %r15
+; FALLBACK15-NEXT:    pushq %r14
+; FALLBACK15-NEXT:    pushq %rbx
+; FALLBACK15-NEXT:    vmovups (%rdi), %ymm0
+; FALLBACK15-NEXT:    vmovups 32(%rdi), %xmm1
+; FALLBACK15-NEXT:    movq 48(%rdi), %rcx
+; FALLBACK15-NEXT:    movq 56(%rdi), %rdi
+; FALLBACK15-NEXT:    movl (%rsi), %eax
+; FALLBACK15-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK15-NEXT:    movq %rcx, -{{[0-9]+}}(%rsp)
+; FALLBACK15-NEXT:    vmovaps %xmm1, -{{[0-9]+}}(%rsp)
+; FALLBACK15-NEXT:    vmovups %ymm0, -{{[0-9]+}}(%rsp)
+; FALLBACK15-NEXT:    sarq $63, %rdi
+; FALLBACK15-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK15-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK15-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK15-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK15-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK15-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK15-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK15-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK15-NEXT:    leal (,%rax,8), %ecx
+; FALLBACK15-NEXT:    andl $56, %ecx
+; FALLBACK15-NEXT:    andl $56, %eax
+; FALLBACK15-NEXT:    movq -96(%rsp,%rax), %rdi
+; FALLBACK15-NEXT:    movq -104(%rsp,%rax), %r9
+; FALLBACK15-NEXT:    movq %r9, %rsi
+; FALLBACK15-NEXT:    shrdq %cl, %rdi, %rsi
+; FALLBACK15-NEXT:    movq -112(%rsp,%rax), %r10
+; FALLBACK15-NEXT:    movq %r10, %r8
+; FALLBACK15-NEXT:    shrdq %cl, %r9, %r8
+; FALLBACK15-NEXT:    movq -80(%rsp,%rax), %r9
+; FALLBACK15-NEXT:    movq -88(%rsp,%rax), %r11
+; FALLBACK15-NEXT:    movq %r11, %rbx
+; FALLBACK15-NEXT:    shrdq %cl, %r9, %rbx
+; FALLBACK15-NEXT:    shrdq %cl, %r11, %rdi
+; FALLBACK15-NEXT:    movq -72(%rsp,%rax), %r11
+; FALLBACK15-NEXT:    shrdq %cl, %r11, %r9
+; FALLBACK15-NEXT:    movq -128(%rsp,%rax), %r14
+; FALLBACK15-NEXT:    movq -120(%rsp,%rax), %rax
+; FALLBACK15-NEXT:    movq %rax, %r15
+; FALLBACK15-NEXT:    shrdq %cl, %r10, %r15
+; FALLBACK15-NEXT:    sarxq %rcx, %r11, %r10
+; FALLBACK15-NEXT:    # kill: def $cl killed $cl killed $rcx
+; FALLBACK15-NEXT:    shrdq %cl, %rax, %r14
+; FALLBACK15-NEXT:    movq %r15, 8(%rdx)
+; FALLBACK15-NEXT:    movq %r9, 48(%rdx)
+; FALLBACK15-NEXT:    movq %rdi, 32(%rdx)
+; FALLBACK15-NEXT:    movq %rbx, 40(%rdx)
+; FALLBACK15-NEXT:    movq %r8, 16(%rdx)
+; FALLBACK15-NEXT:    movq %rsi, 24(%rdx)
+; FALLBACK15-NEXT:    movq %r14, (%rdx)
+; FALLBACK15-NEXT:    movq %r10, 56(%rdx)
+; FALLBACK15-NEXT:    popq %rbx
+; FALLBACK15-NEXT:    popq %r14
+; FALLBACK15-NEXT:    popq %r15
+; FALLBACK15-NEXT:    vzeroupper
+; FALLBACK15-NEXT:    retq
+;
+; FALLBACK16-LABEL: ashr_64bytes:
+; FALLBACK16:       # %bb.0:
+; FALLBACK16-NEXT:    pushl %ebp
+; FALLBACK16-NEXT:    pushl %ebx
+; FALLBACK16-NEXT:    pushl %edi
+; FALLBACK16-NEXT:    pushl %esi
+; FALLBACK16-NEXT:    subl $204, %esp
+; FALLBACK16-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; FALLBACK16-NEXT:    movl (%ecx), %eax
+; FALLBACK16-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK16-NEXT:    movl 4(%ecx), %eax
+; FALLBACK16-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK16-NEXT:    movl 8(%ecx), %eax
+; FALLBACK16-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK16-NEXT:    movl 12(%ecx), %eax
+; FALLBACK16-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK16-NEXT:    movl 16(%ecx), %eax
+; FALLBACK16-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK16-NEXT:    movl 20(%ecx), %eax
+; FALLBACK16-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK16-NEXT:    movl 24(%ecx), %eax
+; FALLBACK16-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK16-NEXT:    movl 28(%ecx), %eax
+; FALLBACK16-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK16-NEXT:    movl 32(%ecx), %eax
+; FALLBACK16-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK16-NEXT:    movl 36(%ecx), %eax
+; FALLBACK16-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK16-NEXT:    movl 40(%ecx), %ebx
+; FALLBACK16-NEXT:    movl 44(%ecx), %edi
+; FALLBACK16-NEXT:    movl 48(%ecx), %esi
+; FALLBACK16-NEXT:    movl 52(%ecx), %edx
+; FALLBACK16-NEXT:    movl 56(%ecx), %eax
+; FALLBACK16-NEXT:    movl 60(%ecx), %ecx
+; FALLBACK16-NEXT:    movl {{[0-9]+}}(%esp), %ebp
+; FALLBACK16-NEXT:    movl (%ebp), %ebp
+; FALLBACK16-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK16-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; FALLBACK16-NEXT:    movl %esi, {{[0-9]+}}(%esp)
+; FALLBACK16-NEXT:    movl %edi, {{[0-9]+}}(%esp)
+; FALLBACK16-NEXT:    movl %ebx, {{[0-9]+}}(%esp)
+; FALLBACK16-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK16-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK16-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK16-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK16-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK16-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK16-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK16-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK16-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK16-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK16-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; FALLBACK16-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK16-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK16-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK16-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK16-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK16-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK16-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK16-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK16-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK16-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK16-NEXT:    sarl $31, %ecx
+; FALLBACK16-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; FALLBACK16-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; FALLBACK16-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; FALLBACK16-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; FALLBACK16-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; FALLBACK16-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; FALLBACK16-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; FALLBACK16-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; FALLBACK16-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; FALLBACK16-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; FALLBACK16-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; FALLBACK16-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; FALLBACK16-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; FALLBACK16-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; FALLBACK16-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; FALLBACK16-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; FALLBACK16-NEXT:    movl %ebp, %ecx
+; FALLBACK16-NEXT:    movl %ebp, %esi
+; FALLBACK16-NEXT:    andl $60, %esi
+; FALLBACK16-NEXT:    movl 68(%esp,%esi), %edx
+; FALLBACK16-NEXT:    shll $3, %ecx
+; FALLBACK16-NEXT:    andl $24, %ecx
+; FALLBACK16-NEXT:    movl %edx, %eax
+; FALLBACK16-NEXT:    shrl %cl, %eax
+; FALLBACK16-NEXT:    movl 72(%esp,%esi), %edi
+; FALLBACK16-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK16-NEXT:    addl %edi, %edi
+; FALLBACK16-NEXT:    movb %cl, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill
+; FALLBACK16-NEXT:    movl %ecx, %ebx
+; FALLBACK16-NEXT:    movb {{[-0-9]+}}(%e{{[sb]}}p), %ch # 1-byte Reload
+; FALLBACK16-NEXT:    notb %ch
+; FALLBACK16-NEXT:    movb %ch, %cl
+; FALLBACK16-NEXT:    movb %ch, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill
+; FALLBACK16-NEXT:    shll %cl, %edi
+; FALLBACK16-NEXT:    orl %eax, %edi
+; FALLBACK16-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK16-NEXT:    movl 64(%esp,%esi), %eax
+; FALLBACK16-NEXT:    movb %bl, %cl
+; FALLBACK16-NEXT:    shrl %cl, %eax
+; FALLBACK16-NEXT:    addl %edx, %edx
+; FALLBACK16-NEXT:    movb %ch, %cl
+; FALLBACK16-NEXT:    shll %cl, %edx
+; FALLBACK16-NEXT:    orl %eax, %edx
+; FALLBACK16-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK16-NEXT:    movl 76(%esp,%esi), %ebp
+; FALLBACK16-NEXT:    movl %ebp, %edx
+; FALLBACK16-NEXT:    movb %bl, %cl
+; FALLBACK16-NEXT:    shrl %cl, %edx
+; FALLBACK16-NEXT:    movl 80(%esp,%esi), %edi
+; FALLBACK16-NEXT:    leal (%edi,%edi), %eax
+; FALLBACK16-NEXT:    movb %ch, %cl
+; FALLBACK16-NEXT:    shll %cl, %eax
+; FALLBACK16-NEXT:    orl %edx, %eax
+; FALLBACK16-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK16-NEXT:    movb %bl, %cl
+; FALLBACK16-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK16-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK16-NEXT:    shrl %cl, %eax
+; FALLBACK16-NEXT:    addl %ebp, %ebp
+; FALLBACK16-NEXT:    movb %ch, %cl
+; FALLBACK16-NEXT:    shll %cl, %ebp
+; FALLBACK16-NEXT:    orl %eax, %ebp
+; FALLBACK16-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK16-NEXT:    movl %esi, %edx
+; FALLBACK16-NEXT:    movl 84(%esp,%esi), %eax
+; FALLBACK16-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK16-NEXT:    movb %bl, %cl
+; FALLBACK16-NEXT:    shrl %cl, %eax
+; FALLBACK16-NEXT:    movl 88(%esp,%esi), %esi
+; FALLBACK16-NEXT:    leal (%esi,%esi), %ebp
+; FALLBACK16-NEXT:    movb %ch, %cl
+; FALLBACK16-NEXT:    shll %cl, %ebp
+; FALLBACK16-NEXT:    orl %eax, %ebp
+; FALLBACK16-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK16-NEXT:    movb %bl, %cl
+; FALLBACK16-NEXT:    shrl %cl, %edi
+; FALLBACK16-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; FALLBACK16-NEXT:    addl %ebx, %ebx
+; FALLBACK16-NEXT:    movb %ch, %cl
+; FALLBACK16-NEXT:    shll %cl, %ebx
+; FALLBACK16-NEXT:    orl %edi, %ebx
+; FALLBACK16-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK16-NEXT:    movl %edx, %eax
+; FALLBACK16-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK16-NEXT:    movl 92(%esp,%edx), %ebp
+; FALLBACK16-NEXT:    movl %ebp, %edx
+; FALLBACK16-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; FALLBACK16-NEXT:    movb %bl, %cl
+; FALLBACK16-NEXT:    shrl %cl, %edx
+; FALLBACK16-NEXT:    movl 96(%esp,%eax), %edi
+; FALLBACK16-NEXT:    leal (%edi,%edi), %eax
+; FALLBACK16-NEXT:    movb %ch, %cl
+; FALLBACK16-NEXT:    shll %cl, %eax
+; FALLBACK16-NEXT:    orl %edx, %eax
+; FALLBACK16-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK16-NEXT:    movb %bl, %cl
+; FALLBACK16-NEXT:    shrl %cl, %esi
+; FALLBACK16-NEXT:    addl %ebp, %ebp
+; FALLBACK16-NEXT:    movb %ch, %cl
+; FALLBACK16-NEXT:    shll %cl, %ebp
+; FALLBACK16-NEXT:    orl %esi, %ebp
+; FALLBACK16-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK16-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; FALLBACK16-NEXT:    movl 100(%esp,%edx), %eax
+; FALLBACK16-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK16-NEXT:    movb %bl, %cl
+; FALLBACK16-NEXT:    shrl %cl, %eax
+; FALLBACK16-NEXT:    movl 104(%esp,%edx), %esi
+; FALLBACK16-NEXT:    leal (%esi,%esi), %ebp
+; FALLBACK16-NEXT:    movb %ch, %cl
+; FALLBACK16-NEXT:    shll %cl, %ebp
+; FALLBACK16-NEXT:    orl %eax, %ebp
+; FALLBACK16-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK16-NEXT:    movl %ebx, %edx
+; FALLBACK16-NEXT:    movb %dl, %cl
+; FALLBACK16-NEXT:    shrl %cl, %edi
+; FALLBACK16-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; FALLBACK16-NEXT:    addl %ebx, %ebx
+; FALLBACK16-NEXT:    movb %ch, %cl
+; FALLBACK16-NEXT:    shll %cl, %ebx
+; FALLBACK16-NEXT:    orl %edi, %ebx
+; FALLBACK16-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK16-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
+; FALLBACK16-NEXT:    movl 108(%esp,%ebp), %edi
+; FALLBACK16-NEXT:    movl %edi, %eax
+; FALLBACK16-NEXT:    movl %edx, %ebx
+; FALLBACK16-NEXT:    movl %ebx, %ecx
+; FALLBACK16-NEXT:    shrl %cl, %eax
+; FALLBACK16-NEXT:    movl 112(%esp,%ebp), %ecx
+; FALLBACK16-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK16-NEXT:    movl %ebp, %edx
+; FALLBACK16-NEXT:    leal (%ecx,%ecx), %ebp
+; FALLBACK16-NEXT:    movb {{[-0-9]+}}(%e{{[sb]}}p), %ch # 1-byte Reload
+; FALLBACK16-NEXT:    movb %ch, %cl
+; FALLBACK16-NEXT:    shll %cl, %ebp
+; FALLBACK16-NEXT:    orl %eax, %ebp
+; FALLBACK16-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK16-NEXT:    movb %bl, %cl
+; FALLBACK16-NEXT:    shrl %cl, %esi
+; FALLBACK16-NEXT:    addl %edi, %edi
+; FALLBACK16-NEXT:    movb %ch, %cl
+; FALLBACK16-NEXT:    shll %cl, %edi
+; FALLBACK16-NEXT:    orl %esi, %edi
+; FALLBACK16-NEXT:    movl 116(%esp,%edx), %esi
+; FALLBACK16-NEXT:    movl %esi, %eax
+; FALLBACK16-NEXT:    movl %ebx, %ecx
+; FALLBACK16-NEXT:    shrl %cl, %eax
+; FALLBACK16-NEXT:    movl 120(%esp,%edx), %edx
+; FALLBACK16-NEXT:    leal (%edx,%edx), %ebp
+; FALLBACK16-NEXT:    movb {{[-0-9]+}}(%e{{[sb]}}p), %ch # 1-byte Reload
+; FALLBACK16-NEXT:    movb %ch, %cl
+; FALLBACK16-NEXT:    shll %cl, %ebp
+; FALLBACK16-NEXT:    orl %eax, %ebp
+; FALLBACK16-NEXT:    movb %bl, %cl
+; FALLBACK16-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK16-NEXT:    shrl %cl, %eax
+; FALLBACK16-NEXT:    addl %esi, %esi
+; FALLBACK16-NEXT:    movb %ch, %cl
+; FALLBACK16-NEXT:    shll %cl, %esi
+; FALLBACK16-NEXT:    orl %eax, %esi
+; FALLBACK16-NEXT:    movb %bl, %cl
+; FALLBACK16-NEXT:    movl %edx, %eax
+; FALLBACK16-NEXT:    shrl %cl, %eax
+; FALLBACK16-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; FALLBACK16-NEXT:    movl 124(%esp,%edx), %ebx
+; FALLBACK16-NEXT:    leal (%ebx,%ebx), %edx
+; FALLBACK16-NEXT:    movb %ch, %cl
+; FALLBACK16-NEXT:    shll %cl, %edx
+; FALLBACK16-NEXT:    orl %eax, %edx
+; FALLBACK16-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK16-NEXT:    # kill: def $cl killed $cl killed $ecx
+; FALLBACK16-NEXT:    sarl %cl, %ebx
+; FALLBACK16-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; FALLBACK16-NEXT:    movl %ebx, 60(%eax)
+; FALLBACK16-NEXT:    movl %edx, 56(%eax)
+; FALLBACK16-NEXT:    movl %esi, 48(%eax)
+; FALLBACK16-NEXT:    movl %ebp, 52(%eax)
+; FALLBACK16-NEXT:    movl %edi, 40(%eax)
+; FALLBACK16-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK16-NEXT:    movl %ecx, 44(%eax)
+; FALLBACK16-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK16-NEXT:    movl %ecx, 32(%eax)
+; FALLBACK16-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK16-NEXT:    movl %ecx, 36(%eax)
+; FALLBACK16-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK16-NEXT:    movl %ecx, 24(%eax)
+; FALLBACK16-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK16-NEXT:    movl %ecx, 28(%eax)
+; FALLBACK16-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK16-NEXT:    movl %ecx, 16(%eax)
+; FALLBACK16-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK16-NEXT:    movl %ecx, 20(%eax)
+; FALLBACK16-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK16-NEXT:    movl %ecx, 8(%eax)
+; FALLBACK16-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK16-NEXT:    movl %ecx, 12(%eax)
+; FALLBACK16-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK16-NEXT:    movl %ecx, (%eax)
+; FALLBACK16-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK16-NEXT:    movl %ecx, 4(%eax)
+; FALLBACK16-NEXT:    addl $204, %esp
+; FALLBACK16-NEXT:    popl %esi
+; FALLBACK16-NEXT:    popl %edi
+; FALLBACK16-NEXT:    popl %ebx
+; FALLBACK16-NEXT:    popl %ebp
+; FALLBACK16-NEXT:    retl
+;
+; FALLBACK17-LABEL: ashr_64bytes:
+; FALLBACK17:       # %bb.0:
+; FALLBACK17-NEXT:    pushl %ebp
+; FALLBACK17-NEXT:    pushl %ebx
+; FALLBACK17-NEXT:    pushl %edi
+; FALLBACK17-NEXT:    pushl %esi
+; FALLBACK17-NEXT:    subl $188, %esp
+; FALLBACK17-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; FALLBACK17-NEXT:    movl (%eax), %ecx
+; FALLBACK17-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK17-NEXT:    movl 4(%eax), %ecx
+; FALLBACK17-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK17-NEXT:    movl 8(%eax), %ecx
+; FALLBACK17-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK17-NEXT:    movl 12(%eax), %ecx
+; FALLBACK17-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK17-NEXT:    movl 16(%eax), %ecx
+; FALLBACK17-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK17-NEXT:    movl 20(%eax), %ecx
+; FALLBACK17-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK17-NEXT:    movl 24(%eax), %ecx
+; FALLBACK17-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK17-NEXT:    movl 28(%eax), %ecx
+; FALLBACK17-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK17-NEXT:    movl 32(%eax), %ecx
+; FALLBACK17-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK17-NEXT:    movl 36(%eax), %ecx
+; FALLBACK17-NEXT:    movl %ecx, (%esp) # 4-byte Spill
+; FALLBACK17-NEXT:    movl 40(%eax), %ebp
+; FALLBACK17-NEXT:    movl 44(%eax), %ebx
+; FALLBACK17-NEXT:    movl 48(%eax), %edi
+; FALLBACK17-NEXT:    movl 52(%eax), %esi
+; FALLBACK17-NEXT:    movl 56(%eax), %edx
+; FALLBACK17-NEXT:    movl 60(%eax), %eax
+; FALLBACK17-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; FALLBACK17-NEXT:    movl (%ecx), %ecx
+; FALLBACK17-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; FALLBACK17-NEXT:    movl %esi, {{[0-9]+}}(%esp)
+; FALLBACK17-NEXT:    movl %edi, {{[0-9]+}}(%esp)
+; FALLBACK17-NEXT:    movl %ebx, {{[0-9]+}}(%esp)
+; FALLBACK17-NEXT:    movl %ebp, {{[0-9]+}}(%esp)
+; FALLBACK17-NEXT:    movl (%esp), %edx # 4-byte Reload
+; FALLBACK17-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; FALLBACK17-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; FALLBACK17-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; FALLBACK17-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; FALLBACK17-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; FALLBACK17-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; FALLBACK17-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; FALLBACK17-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; FALLBACK17-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; FALLBACK17-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK17-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; FALLBACK17-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; FALLBACK17-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; FALLBACK17-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; FALLBACK17-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; FALLBACK17-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; FALLBACK17-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; FALLBACK17-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; FALLBACK17-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; FALLBACK17-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; FALLBACK17-NEXT:    sarl $31, %eax
+; FALLBACK17-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK17-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK17-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK17-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK17-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK17-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK17-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK17-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK17-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK17-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK17-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK17-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK17-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK17-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK17-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK17-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK17-NEXT:    movl %ecx, %ebp
+; FALLBACK17-NEXT:    andl $60, %ebp
+; FALLBACK17-NEXT:    movl 56(%esp,%ebp), %edx
+; FALLBACK17-NEXT:    movl 52(%esp,%ebp), %eax
+; FALLBACK17-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK17-NEXT:    shll $3, %ecx
+; FALLBACK17-NEXT:    andl $24, %ecx
+; FALLBACK17-NEXT:    shrdl %cl, %edx, %eax
+; FALLBACK17-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK17-NEXT:    movl 64(%esp,%ebp), %edi
+; FALLBACK17-NEXT:    movl 60(%esp,%ebp), %eax
+; FALLBACK17-NEXT:    movl %eax, %esi
+; FALLBACK17-NEXT:    shrdl %cl, %edi, %esi
+; FALLBACK17-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK17-NEXT:    shrdl %cl, %eax, %edx
+; FALLBACK17-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK17-NEXT:    movl 72(%esp,%ebp), %esi
+; FALLBACK17-NEXT:    movl 68(%esp,%ebp), %eax
+; FALLBACK17-NEXT:    movl %eax, %edx
+; FALLBACK17-NEXT:    shrdl %cl, %esi, %edx
+; FALLBACK17-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK17-NEXT:    shrdl %cl, %eax, %edi
+; FALLBACK17-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK17-NEXT:    movl 80(%esp,%ebp), %edi
+; FALLBACK17-NEXT:    movl 76(%esp,%ebp), %eax
+; FALLBACK17-NEXT:    movl %eax, %edx
+; FALLBACK17-NEXT:    shrdl %cl, %edi, %edx
+; FALLBACK17-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK17-NEXT:    shrdl %cl, %eax, %esi
+; FALLBACK17-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK17-NEXT:    movl 88(%esp,%ebp), %esi
+; FALLBACK17-NEXT:    movl 84(%esp,%ebp), %eax
+; FALLBACK17-NEXT:    movl %eax, %edx
+; FALLBACK17-NEXT:    shrdl %cl, %esi, %edx
+; FALLBACK17-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK17-NEXT:    movl %esi, %edx
+; FALLBACK17-NEXT:    shrdl %cl, %eax, %edi
+; FALLBACK17-NEXT:    movl %edi, (%esp) # 4-byte Spill
+; FALLBACK17-NEXT:    movl 96(%esp,%ebp), %esi
+; FALLBACK17-NEXT:    movl 92(%esp,%ebp), %eax
+; FALLBACK17-NEXT:    movl %eax, %edi
+; FALLBACK17-NEXT:    shrdl %cl, %esi, %edi
+; FALLBACK17-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK17-NEXT:    shrdl %cl, %eax, %edx
+; FALLBACK17-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK17-NEXT:    movl 104(%esp,%ebp), %edx
+; FALLBACK17-NEXT:    movl 100(%esp,%ebp), %eax
+; FALLBACK17-NEXT:    movl %eax, %edi
+; FALLBACK17-NEXT:    shrdl %cl, %edx, %edi
+; FALLBACK17-NEXT:    shrdl %cl, %eax, %esi
+; FALLBACK17-NEXT:    movl 48(%esp,%ebp), %ebx
+; FALLBACK17-NEXT:    movl 108(%esp,%ebp), %eax
+; FALLBACK17-NEXT:    shrdl %cl, %eax, %edx
+; FALLBACK17-NEXT:    movl {{[0-9]+}}(%esp), %ebp
+; FALLBACK17-NEXT:    movl %edx, 56(%ebp)
+; FALLBACK17-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; FALLBACK17-NEXT:    shrdl %cl, %edx, %ebx
+; FALLBACK17-NEXT:    # kill: def $cl killed $cl killed $ecx
+; FALLBACK17-NEXT:    sarl %cl, %eax
+; FALLBACK17-NEXT:    movl %eax, 60(%ebp)
+; FALLBACK17-NEXT:    movl %esi, 48(%ebp)
+; FALLBACK17-NEXT:    movl %edi, 52(%ebp)
+; FALLBACK17-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK17-NEXT:    movl %eax, 40(%ebp)
+; FALLBACK17-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK17-NEXT:    movl %eax, 44(%ebp)
+; FALLBACK17-NEXT:    movl (%esp), %eax # 4-byte Reload
+; FALLBACK17-NEXT:    movl %eax, 32(%ebp)
+; FALLBACK17-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK17-NEXT:    movl %eax, 36(%ebp)
+; FALLBACK17-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK17-NEXT:    movl %eax, 24(%ebp)
+; FALLBACK17-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK17-NEXT:    movl %eax, 28(%ebp)
+; FALLBACK17-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK17-NEXT:    movl %eax, 16(%ebp)
+; FALLBACK17-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK17-NEXT:    movl %eax, 20(%ebp)
+; FALLBACK17-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK17-NEXT:    movl %eax, 8(%ebp)
+; FALLBACK17-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK17-NEXT:    movl %eax, 12(%ebp)
+; FALLBACK17-NEXT:    movl %ebx, (%ebp)
+; FALLBACK17-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK17-NEXT:    movl %eax, 4(%ebp)
+; FALLBACK17-NEXT:    addl $188, %esp
+; FALLBACK17-NEXT:    popl %esi
+; FALLBACK17-NEXT:    popl %edi
+; FALLBACK17-NEXT:    popl %ebx
+; FALLBACK17-NEXT:    popl %ebp
+; FALLBACK17-NEXT:    retl
+;
+; FALLBACK18-LABEL: ashr_64bytes:
+; FALLBACK18:       # %bb.0:
+; FALLBACK18-NEXT:    pushl %ebp
+; FALLBACK18-NEXT:    pushl %ebx
+; FALLBACK18-NEXT:    pushl %edi
+; FALLBACK18-NEXT:    pushl %esi
+; FALLBACK18-NEXT:    subl $204, %esp
+; FALLBACK18-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; FALLBACK18-NEXT:    movl (%eax), %ecx
+; FALLBACK18-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK18-NEXT:    movl 4(%eax), %ecx
+; FALLBACK18-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK18-NEXT:    movl 8(%eax), %ecx
+; FALLBACK18-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK18-NEXT:    movl 12(%eax), %ecx
+; FALLBACK18-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK18-NEXT:    movl 16(%eax), %ecx
+; FALLBACK18-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK18-NEXT:    movl 20(%eax), %ecx
+; FALLBACK18-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK18-NEXT:    movl 24(%eax), %ecx
+; FALLBACK18-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK18-NEXT:    movl 28(%eax), %ecx
+; FALLBACK18-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK18-NEXT:    movl 32(%eax), %ecx
+; FALLBACK18-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK18-NEXT:    movl 36(%eax), %ecx
+; FALLBACK18-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK18-NEXT:    movl 40(%eax), %ebp
+; FALLBACK18-NEXT:    movl 44(%eax), %ebx
+; FALLBACK18-NEXT:    movl 48(%eax), %edi
+; FALLBACK18-NEXT:    movl 52(%eax), %esi
+; FALLBACK18-NEXT:    movl 56(%eax), %edx
+; FALLBACK18-NEXT:    movl 60(%eax), %ecx
+; FALLBACK18-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; FALLBACK18-NEXT:    movl (%eax), %eax
+; FALLBACK18-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; FALLBACK18-NEXT:    movl %esi, {{[0-9]+}}(%esp)
+; FALLBACK18-NEXT:    movl %edi, {{[0-9]+}}(%esp)
+; FALLBACK18-NEXT:    movl %ebx, {{[0-9]+}}(%esp)
+; FALLBACK18-NEXT:    movl %ebp, {{[0-9]+}}(%esp)
+; FALLBACK18-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; FALLBACK18-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; FALLBACK18-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; FALLBACK18-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; FALLBACK18-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; FALLBACK18-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; FALLBACK18-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; FALLBACK18-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; FALLBACK18-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; FALLBACK18-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; FALLBACK18-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; FALLBACK18-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; FALLBACK18-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; FALLBACK18-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; FALLBACK18-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; FALLBACK18-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; FALLBACK18-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; FALLBACK18-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; FALLBACK18-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; FALLBACK18-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; FALLBACK18-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; FALLBACK18-NEXT:    sarl $31, %ecx
+; FALLBACK18-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; FALLBACK18-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; FALLBACK18-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; FALLBACK18-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; FALLBACK18-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; FALLBACK18-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; FALLBACK18-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; FALLBACK18-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; FALLBACK18-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; FALLBACK18-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; FALLBACK18-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; FALLBACK18-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; FALLBACK18-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; FALLBACK18-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; FALLBACK18-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; FALLBACK18-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; FALLBACK18-NEXT:    movl %eax, %ecx
+; FALLBACK18-NEXT:    leal (,%eax,8), %edx
+; FALLBACK18-NEXT:    andl $24, %edx
+; FALLBACK18-NEXT:    andl $60, %ecx
+; FALLBACK18-NEXT:    movl 68(%esp,%ecx), %esi
+; FALLBACK18-NEXT:    movl 72(%esp,%ecx), %edi
+; FALLBACK18-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK18-NEXT:    shrxl %edx, %esi, %eax
+; FALLBACK18-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK18-NEXT:    movl %edx, %ebx
+; FALLBACK18-NEXT:    notb %bl
+; FALLBACK18-NEXT:    leal (%edi,%edi), %ebp
+; FALLBACK18-NEXT:    shlxl %ebx, %ebp, %eax
+; FALLBACK18-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; FALLBACK18-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK18-NEXT:    shrxl %edx, 64(%esp,%ecx), %edi
+; FALLBACK18-NEXT:    addl %esi, %esi
+; FALLBACK18-NEXT:    shlxl %ebx, %esi, %eax
+; FALLBACK18-NEXT:    orl %edi, %eax
+; FALLBACK18-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK18-NEXT:    movl 80(%esp,%ecx), %esi
+; FALLBACK18-NEXT:    leal (%esi,%esi), %edi
+; FALLBACK18-NEXT:    shlxl %ebx, %edi, %eax
+; FALLBACK18-NEXT:    movl 76(%esp,%ecx), %edi
+; FALLBACK18-NEXT:    shrxl %edx, %edi, %ebp
+; FALLBACK18-NEXT:    orl %ebp, %eax
+; FALLBACK18-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK18-NEXT:    shrxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; FALLBACK18-NEXT:    addl %edi, %edi
+; FALLBACK18-NEXT:    shlxl %ebx, %edi, %edi
+; FALLBACK18-NEXT:    orl %eax, %edi
+; FALLBACK18-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK18-NEXT:    movl 88(%esp,%ecx), %eax
+; FALLBACK18-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK18-NEXT:    leal (%eax,%eax), %edi
+; FALLBACK18-NEXT:    shlxl %ebx, %edi, %eax
+; FALLBACK18-NEXT:    movl 84(%esp,%ecx), %edi
+; FALLBACK18-NEXT:    shrxl %edx, %edi, %ebp
+; FALLBACK18-NEXT:    orl %ebp, %eax
+; FALLBACK18-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK18-NEXT:    shrxl %edx, %esi, %esi
+; FALLBACK18-NEXT:    addl %edi, %edi
+; FALLBACK18-NEXT:    shlxl %ebx, %edi, %eax
+; FALLBACK18-NEXT:    orl %esi, %eax
+; FALLBACK18-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK18-NEXT:    movl 96(%esp,%ecx), %esi
+; FALLBACK18-NEXT:    leal (%esi,%esi), %edi
+; FALLBACK18-NEXT:    shlxl %ebx, %edi, %eax
+; FALLBACK18-NEXT:    movl 92(%esp,%ecx), %edi
+; FALLBACK18-NEXT:    shrxl %edx, %edi, %ebp
+; FALLBACK18-NEXT:    orl %ebp, %eax
+; FALLBACK18-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK18-NEXT:    shrxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; FALLBACK18-NEXT:    addl %edi, %edi
+; FALLBACK18-NEXT:    shlxl %ebx, %edi, %edi
+; FALLBACK18-NEXT:    orl %eax, %edi
+; FALLBACK18-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK18-NEXT:    movl 104(%esp,%ecx), %eax
+; FALLBACK18-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK18-NEXT:    leal (%eax,%eax), %edi
+; FALLBACK18-NEXT:    shlxl %ebx, %edi, %eax
+; FALLBACK18-NEXT:    movl 100(%esp,%ecx), %edi
+; FALLBACK18-NEXT:    shrxl %edx, %edi, %ebp
+; FALLBACK18-NEXT:    orl %ebp, %eax
+; FALLBACK18-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK18-NEXT:    shrxl %edx, %esi, %esi
+; FALLBACK18-NEXT:    addl %edi, %edi
+; FALLBACK18-NEXT:    shlxl %ebx, %edi, %eax
+; FALLBACK18-NEXT:    orl %esi, %eax
+; FALLBACK18-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK18-NEXT:    movl 112(%esp,%ecx), %eax
+; FALLBACK18-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK18-NEXT:    leal (%eax,%eax), %esi
+; FALLBACK18-NEXT:    shlxl %ebx, %esi, %eax
+; FALLBACK18-NEXT:    movl 108(%esp,%ecx), %esi
+; FALLBACK18-NEXT:    movl %ecx, %edi
+; FALLBACK18-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK18-NEXT:    shrxl %edx, %esi, %ebp
+; FALLBACK18-NEXT:    orl %ebp, %eax
+; FALLBACK18-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK18-NEXT:    shrxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
+; FALLBACK18-NEXT:    addl %esi, %esi
+; FALLBACK18-NEXT:    shlxl %ebx, %esi, %esi
+; FALLBACK18-NEXT:    orl %ecx, %esi
+; FALLBACK18-NEXT:    movl 120(%esp,%edi), %ebp
+; FALLBACK18-NEXT:    leal (%ebp,%ebp), %ecx
+; FALLBACK18-NEXT:    shlxl %ebx, %ecx, %ecx
+; FALLBACK18-NEXT:    movl 116(%esp,%edi), %eax
+; FALLBACK18-NEXT:    shrxl %edx, %eax, %edi
+; FALLBACK18-NEXT:    orl %edi, %ecx
+; FALLBACK18-NEXT:    shrxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
+; FALLBACK18-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK18-NEXT:    addl %eax, %eax
+; FALLBACK18-NEXT:    shlxl %ebx, %eax, %edi
+; FALLBACK18-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
+; FALLBACK18-NEXT:    shrxl %edx, %ebp, %eax
+; FALLBACK18-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
+; FALLBACK18-NEXT:    movl 124(%esp,%ebp), %ebp
+; FALLBACK18-NEXT:    sarxl %edx, %ebp, %edx
+; FALLBACK18-NEXT:    addl %ebp, %ebp
+; FALLBACK18-NEXT:    shlxl %ebx, %ebp, %ebx
+; FALLBACK18-NEXT:    orl %eax, %ebx
+; FALLBACK18-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; FALLBACK18-NEXT:    movl %edx, 60(%eax)
+; FALLBACK18-NEXT:    movl %ebx, 56(%eax)
+; FALLBACK18-NEXT:    movl %edi, 48(%eax)
+; FALLBACK18-NEXT:    movl %ecx, 52(%eax)
+; FALLBACK18-NEXT:    movl %esi, 40(%eax)
+; FALLBACK18-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK18-NEXT:    movl %ecx, 44(%eax)
+; FALLBACK18-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK18-NEXT:    movl %ecx, 32(%eax)
+; FALLBACK18-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK18-NEXT:    movl %ecx, 36(%eax)
+; FALLBACK18-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK18-NEXT:    movl %ecx, 24(%eax)
+; FALLBACK18-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK18-NEXT:    movl %ecx, 28(%eax)
+; FALLBACK18-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK18-NEXT:    movl %ecx, 16(%eax)
+; FALLBACK18-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK18-NEXT:    movl %ecx, 20(%eax)
+; FALLBACK18-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK18-NEXT:    movl %ecx, 8(%eax)
+; FALLBACK18-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK18-NEXT:    movl %ecx, 12(%eax)
+; FALLBACK18-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK18-NEXT:    movl %ecx, (%eax)
+; FALLBACK18-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK18-NEXT:    movl %ecx, 4(%eax)
+; FALLBACK18-NEXT:    addl $204, %esp
+; FALLBACK18-NEXT:    popl %esi
+; FALLBACK18-NEXT:    popl %edi
+; FALLBACK18-NEXT:    popl %ebx
+; FALLBACK18-NEXT:    popl %ebp
+; FALLBACK18-NEXT:    retl
+;
+; FALLBACK19-LABEL: ashr_64bytes:
+; FALLBACK19:       # %bb.0:
+; FALLBACK19-NEXT:    pushl %ebp
+; FALLBACK19-NEXT:    pushl %ebx
+; FALLBACK19-NEXT:    pushl %edi
+; FALLBACK19-NEXT:    pushl %esi
+; FALLBACK19-NEXT:    subl $188, %esp
+; FALLBACK19-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; FALLBACK19-NEXT:    movl (%eax), %ecx
+; FALLBACK19-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK19-NEXT:    movl 4(%eax), %ecx
+; FALLBACK19-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK19-NEXT:    movl 8(%eax), %ecx
+; FALLBACK19-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK19-NEXT:    movl 12(%eax), %ecx
+; FALLBACK19-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK19-NEXT:    movl 16(%eax), %ecx
+; FALLBACK19-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK19-NEXT:    movl 20(%eax), %ecx
+; FALLBACK19-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK19-NEXT:    movl 24(%eax), %ecx
+; FALLBACK19-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK19-NEXT:    movl 28(%eax), %ecx
+; FALLBACK19-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK19-NEXT:    movl 32(%eax), %ecx
+; FALLBACK19-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK19-NEXT:    movl 36(%eax), %ecx
+; FALLBACK19-NEXT:    movl %ecx, (%esp) # 4-byte Spill
+; FALLBACK19-NEXT:    movl 40(%eax), %ebp
+; FALLBACK19-NEXT:    movl 44(%eax), %ebx
+; FALLBACK19-NEXT:    movl 48(%eax), %edi
+; FALLBACK19-NEXT:    movl 52(%eax), %esi
+; FALLBACK19-NEXT:    movl 56(%eax), %edx
+; FALLBACK19-NEXT:    movl 60(%eax), %eax
+; FALLBACK19-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; FALLBACK19-NEXT:    movl (%ecx), %ecx
+; FALLBACK19-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; FALLBACK19-NEXT:    movl %esi, {{[0-9]+}}(%esp)
+; FALLBACK19-NEXT:    movl %edi, {{[0-9]+}}(%esp)
+; FALLBACK19-NEXT:    movl %ebx, {{[0-9]+}}(%esp)
+; FALLBACK19-NEXT:    movl %ebp, {{[0-9]+}}(%esp)
+; FALLBACK19-NEXT:    movl (%esp), %edx # 4-byte Reload
+; FALLBACK19-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; FALLBACK19-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; FALLBACK19-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; FALLBACK19-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; FALLBACK19-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; FALLBACK19-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; FALLBACK19-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; FALLBACK19-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; FALLBACK19-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; FALLBACK19-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK19-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; FALLBACK19-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; FALLBACK19-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; FALLBACK19-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; FALLBACK19-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; FALLBACK19-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; FALLBACK19-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; FALLBACK19-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; FALLBACK19-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; FALLBACK19-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; FALLBACK19-NEXT:    sarl $31, %eax
+; FALLBACK19-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK19-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK19-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK19-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK19-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK19-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK19-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK19-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK19-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK19-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK19-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK19-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK19-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK19-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK19-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK19-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK19-NEXT:    movl %ecx, %ebp
+; FALLBACK19-NEXT:    andl $60, %ebp
+; FALLBACK19-NEXT:    movl 56(%esp,%ebp), %edx
+; FALLBACK19-NEXT:    movl 52(%esp,%ebp), %eax
+; FALLBACK19-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK19-NEXT:    shll $3, %ecx
+; FALLBACK19-NEXT:    andl $24, %ecx
+; FALLBACK19-NEXT:    shrdl %cl, %edx, %eax
+; FALLBACK19-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK19-NEXT:    movl 64(%esp,%ebp), %edi
+; FALLBACK19-NEXT:    movl 60(%esp,%ebp), %eax
+; FALLBACK19-NEXT:    movl %eax, %esi
+; FALLBACK19-NEXT:    shrdl %cl, %edi, %esi
+; FALLBACK19-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK19-NEXT:    shrdl %cl, %eax, %edx
+; FALLBACK19-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK19-NEXT:    movl 72(%esp,%ebp), %esi
+; FALLBACK19-NEXT:    movl 68(%esp,%ebp), %eax
+; FALLBACK19-NEXT:    movl %eax, %edx
+; FALLBACK19-NEXT:    shrdl %cl, %esi, %edx
+; FALLBACK19-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK19-NEXT:    shrdl %cl, %eax, %edi
+; FALLBACK19-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK19-NEXT:    movl 80(%esp,%ebp), %edi
+; FALLBACK19-NEXT:    movl 76(%esp,%ebp), %eax
+; FALLBACK19-NEXT:    movl %eax, %edx
+; FALLBACK19-NEXT:    shrdl %cl, %edi, %edx
+; FALLBACK19-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK19-NEXT:    shrdl %cl, %eax, %esi
+; FALLBACK19-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK19-NEXT:    movl 88(%esp,%ebp), %ebx
+; FALLBACK19-NEXT:    movl 84(%esp,%ebp), %eax
+; FALLBACK19-NEXT:    movl %eax, %edx
+; FALLBACK19-NEXT:    shrdl %cl, %ebx, %edx
+; FALLBACK19-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK19-NEXT:    shrdl %cl, %eax, %edi
+; FALLBACK19-NEXT:    movl %edi, (%esp) # 4-byte Spill
+; FALLBACK19-NEXT:    movl 96(%esp,%ebp), %esi
+; FALLBACK19-NEXT:    movl 92(%esp,%ebp), %eax
+; FALLBACK19-NEXT:    movl %eax, %edx
+; FALLBACK19-NEXT:    shrdl %cl, %esi, %edx
+; FALLBACK19-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK19-NEXT:    shrdl %cl, %eax, %ebx
+; FALLBACK19-NEXT:    movl 104(%esp,%ebp), %eax
+; FALLBACK19-NEXT:    movl 100(%esp,%ebp), %edi
+; FALLBACK19-NEXT:    movl %edi, %edx
+; FALLBACK19-NEXT:    shrdl %cl, %eax, %edx
+; FALLBACK19-NEXT:    shrdl %cl, %edi, %esi
+; FALLBACK19-NEXT:    movl 48(%esp,%ebp), %edi
+; FALLBACK19-NEXT:    movl 108(%esp,%ebp), %ebp
+; FALLBACK19-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK19-NEXT:    shrdl %cl, %ebp, %eax
+; FALLBACK19-NEXT:    movl {{[0-9]+}}(%esp), %ebp
+; FALLBACK19-NEXT:    movl %eax, 56(%ebp)
+; FALLBACK19-NEXT:    movl %esi, 48(%ebp)
+; FALLBACK19-NEXT:    movl %edx, 52(%ebp)
+; FALLBACK19-NEXT:    movl %ebx, 40(%ebp)
+; FALLBACK19-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK19-NEXT:    movl %eax, 44(%ebp)
+; FALLBACK19-NEXT:    movl (%esp), %eax # 4-byte Reload
+; FALLBACK19-NEXT:    movl %eax, 32(%ebp)
+; FALLBACK19-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK19-NEXT:    movl %eax, 36(%ebp)
+; FALLBACK19-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK19-NEXT:    movl %eax, 24(%ebp)
+; FALLBACK19-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK19-NEXT:    movl %eax, 28(%ebp)
+; FALLBACK19-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK19-NEXT:    movl %eax, 16(%ebp)
+; FALLBACK19-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK19-NEXT:    movl %eax, 20(%ebp)
+; FALLBACK19-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK19-NEXT:    movl %eax, 8(%ebp)
+; FALLBACK19-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK19-NEXT:    movl %eax, 12(%ebp)
+; FALLBACK19-NEXT:    sarxl %ecx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; FALLBACK19-NEXT:    # kill: def $cl killed $cl killed $ecx
+; FALLBACK19-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; FALLBACK19-NEXT:    shrdl %cl, %edx, %edi
+; FALLBACK19-NEXT:    movl %edi, (%ebp)
+; FALLBACK19-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK19-NEXT:    movl %ecx, 4(%ebp)
+; FALLBACK19-NEXT:    movl %eax, 60(%ebp)
+; FALLBACK19-NEXT:    addl $188, %esp
+; FALLBACK19-NEXT:    popl %esi
+; FALLBACK19-NEXT:    popl %edi
+; FALLBACK19-NEXT:    popl %ebx
+; FALLBACK19-NEXT:    popl %ebp
+; FALLBACK19-NEXT:    retl
+;
+; FALLBACK20-LABEL: ashr_64bytes:
+; FALLBACK20:       # %bb.0:
+; FALLBACK20-NEXT:    pushl %ebp
+; FALLBACK20-NEXT:    pushl %ebx
+; FALLBACK20-NEXT:    pushl %edi
+; FALLBACK20-NEXT:    pushl %esi
+; FALLBACK20-NEXT:    subl $204, %esp
+; FALLBACK20-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; FALLBACK20-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; FALLBACK20-NEXT:    movups (%ecx), %xmm0
+; FALLBACK20-NEXT:    movups 16(%ecx), %xmm1
+; FALLBACK20-NEXT:    movups 32(%ecx), %xmm2
+; FALLBACK20-NEXT:    movl 48(%ecx), %edx
+; FALLBACK20-NEXT:    movl 52(%ecx), %esi
+; FALLBACK20-NEXT:    movl 56(%ecx), %edi
+; FALLBACK20-NEXT:    movl 60(%ecx), %ecx
+; FALLBACK20-NEXT:    movl (%eax), %eax
+; FALLBACK20-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; FALLBACK20-NEXT:    movl %edi, {{[0-9]+}}(%esp)
+; FALLBACK20-NEXT:    movl %esi, {{[0-9]+}}(%esp)
+; FALLBACK20-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; FALLBACK20-NEXT:    movaps %xmm2, {{[0-9]+}}(%esp)
+; FALLBACK20-NEXT:    movaps %xmm1, {{[0-9]+}}(%esp)
+; FALLBACK20-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
+; FALLBACK20-NEXT:    sarl $31, %ecx
+; FALLBACK20-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; FALLBACK20-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; FALLBACK20-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; FALLBACK20-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; FALLBACK20-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; FALLBACK20-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; FALLBACK20-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; FALLBACK20-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; FALLBACK20-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; FALLBACK20-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; FALLBACK20-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; FALLBACK20-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; FALLBACK20-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; FALLBACK20-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; FALLBACK20-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; FALLBACK20-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; FALLBACK20-NEXT:    movl %eax, %esi
+; FALLBACK20-NEXT:    andl $60, %esi
+; FALLBACK20-NEXT:    movl 68(%esp,%esi), %edx
+; FALLBACK20-NEXT:    shll $3, %eax
+; FALLBACK20-NEXT:    andl $24, %eax
+; FALLBACK20-NEXT:    movl %edx, %edi
+; FALLBACK20-NEXT:    movl %eax, %ecx
+; FALLBACK20-NEXT:    shrl %cl, %edi
+; FALLBACK20-NEXT:    movl 72(%esp,%esi), %ecx
+; FALLBACK20-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK20-NEXT:    leal (%ecx,%ecx), %ebx
+; FALLBACK20-NEXT:    movb %al, %ch
+; FALLBACK20-NEXT:    notb %ch
+; FALLBACK20-NEXT:    movb %ch, %cl
+; FALLBACK20-NEXT:    shll %cl, %ebx
+; FALLBACK20-NEXT:    orl %edi, %ebx
+; FALLBACK20-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK20-NEXT:    movl 64(%esp,%esi), %edi
+; FALLBACK20-NEXT:    movb %al, %cl
+; FALLBACK20-NEXT:    shrl %cl, %edi
+; FALLBACK20-NEXT:    addl %edx, %edx
+; FALLBACK20-NEXT:    movb %ch, %cl
+; FALLBACK20-NEXT:    shll %cl, %edx
+; FALLBACK20-NEXT:    orl %edi, %edx
+; FALLBACK20-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK20-NEXT:    movl 76(%esp,%esi), %edx
+; FALLBACK20-NEXT:    movl %edx, %ebp
+; FALLBACK20-NEXT:    movb %al, %cl
+; FALLBACK20-NEXT:    shrl %cl, %ebp
+; FALLBACK20-NEXT:    movl 80(%esp,%esi), %edi
+; FALLBACK20-NEXT:    leal (%edi,%edi), %ebx
+; FALLBACK20-NEXT:    movb %ch, %cl
+; FALLBACK20-NEXT:    shll %cl, %ebx
+; FALLBACK20-NEXT:    orl %ebp, %ebx
+; FALLBACK20-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK20-NEXT:    movb %al, %cl
+; FALLBACK20-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; FALLBACK20-NEXT:    shrl %cl, %ebx
+; FALLBACK20-NEXT:    addl %edx, %edx
+; FALLBACK20-NEXT:    movb %ch, %cl
+; FALLBACK20-NEXT:    shll %cl, %edx
+; FALLBACK20-NEXT:    orl %ebx, %edx
+; FALLBACK20-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK20-NEXT:    movl 84(%esp,%esi), %ebx
+; FALLBACK20-NEXT:    movl %ebx, %ebp
+; FALLBACK20-NEXT:    movl %eax, %edx
+; FALLBACK20-NEXT:    movb %dl, %cl
+; FALLBACK20-NEXT:    shrl %cl, %ebp
+; FALLBACK20-NEXT:    movl 88(%esp,%esi), %eax
+; FALLBACK20-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK20-NEXT:    addl %eax, %eax
+; FALLBACK20-NEXT:    movb %ch, %cl
+; FALLBACK20-NEXT:    shll %cl, %eax
+; FALLBACK20-NEXT:    orl %ebp, %eax
+; FALLBACK20-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK20-NEXT:    movb %dl, %cl
+; FALLBACK20-NEXT:    shrl %cl, %edi
+; FALLBACK20-NEXT:    addl %ebx, %ebx
+; FALLBACK20-NEXT:    movb %ch, %cl
+; FALLBACK20-NEXT:    movb %ch, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill
+; FALLBACK20-NEXT:    shll %cl, %ebx
+; FALLBACK20-NEXT:    orl %edi, %ebx
+; FALLBACK20-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK20-NEXT:    movl 92(%esp,%esi), %ebx
+; FALLBACK20-NEXT:    movl %ebx, %ebp
+; FALLBACK20-NEXT:    movb %dl, %cl
+; FALLBACK20-NEXT:    shrl %cl, %ebp
+; FALLBACK20-NEXT:    movl 96(%esp,%esi), %edi
+; FALLBACK20-NEXT:    leal (%edi,%edi), %eax
+; FALLBACK20-NEXT:    movb %ch, %cl
+; FALLBACK20-NEXT:    shll %cl, %eax
+; FALLBACK20-NEXT:    orl %ebp, %eax
+; FALLBACK20-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK20-NEXT:    movb %dl, %cl
+; FALLBACK20-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK20-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK20-NEXT:    shrl %cl, %eax
+; FALLBACK20-NEXT:    addl %ebx, %ebx
+; FALLBACK20-NEXT:    movb %ch, %cl
+; FALLBACK20-NEXT:    shll %cl, %ebx
+; FALLBACK20-NEXT:    orl %eax, %ebx
+; FALLBACK20-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK20-NEXT:    movl 100(%esp,%esi), %ebx
+; FALLBACK20-NEXT:    movl %ebx, %ebp
+; FALLBACK20-NEXT:    movb %dl, %cl
+; FALLBACK20-NEXT:    shrl %cl, %ebp
+; FALLBACK20-NEXT:    movl 104(%esp,%esi), %edx
+; FALLBACK20-NEXT:    leal (%edx,%edx), %eax
+; FALLBACK20-NEXT:    movb %ch, %cl
+; FALLBACK20-NEXT:    shll %cl, %eax
+; FALLBACK20-NEXT:    orl %ebp, %eax
+; FALLBACK20-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK20-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK20-NEXT:    movb %al, %cl
+; FALLBACK20-NEXT:    shrl %cl, %edi
+; FALLBACK20-NEXT:    addl %ebx, %ebx
+; FALLBACK20-NEXT:    movb %ch, %cl
+; FALLBACK20-NEXT:    shll %cl, %ebx
+; FALLBACK20-NEXT:    orl %edi, %ebx
+; FALLBACK20-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK20-NEXT:    movl 108(%esp,%esi), %edi
+; FALLBACK20-NEXT:    movl %edi, %ebp
+; FALLBACK20-NEXT:    movl %eax, %ecx
+; FALLBACK20-NEXT:    shrl %cl, %ebp
+; FALLBACK20-NEXT:    movl 112(%esp,%esi), %ecx
+; FALLBACK20-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK20-NEXT:    leal (%ecx,%ecx), %ebx
+; FALLBACK20-NEXT:    movb {{[-0-9]+}}(%e{{[sb]}}p), %ch # 1-byte Reload
+; FALLBACK20-NEXT:    movb %ch, %cl
+; FALLBACK20-NEXT:    shll %cl, %ebx
+; FALLBACK20-NEXT:    orl %ebp, %ebx
+; FALLBACK20-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK20-NEXT:    movb %al, %cl
+; FALLBACK20-NEXT:    shrl %cl, %edx
+; FALLBACK20-NEXT:    addl %edi, %edi
+; FALLBACK20-NEXT:    movb %ch, %cl
+; FALLBACK20-NEXT:    shll %cl, %edi
+; FALLBACK20-NEXT:    orl %edx, %edi
+; FALLBACK20-NEXT:    movl %esi, %edx
+; FALLBACK20-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK20-NEXT:    movl 116(%esp,%esi), %esi
+; FALLBACK20-NEXT:    movl %esi, %ebx
+; FALLBACK20-NEXT:    movb %al, %cl
+; FALLBACK20-NEXT:    shrl %cl, %ebx
+; FALLBACK20-NEXT:    movl 120(%esp,%edx), %eax
+; FALLBACK20-NEXT:    leal (%eax,%eax), %ebp
+; FALLBACK20-NEXT:    movb %ch, %cl
+; FALLBACK20-NEXT:    shll %cl, %ebp
+; FALLBACK20-NEXT:    orl %ebx, %ebp
+; FALLBACK20-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; FALLBACK20-NEXT:    movb %dl, %cl
+; FALLBACK20-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; FALLBACK20-NEXT:    shrl %cl, %ebx
+; FALLBACK20-NEXT:    addl %esi, %esi
+; FALLBACK20-NEXT:    movb %ch, %cl
+; FALLBACK20-NEXT:    shll %cl, %esi
+; FALLBACK20-NEXT:    orl %ebx, %esi
+; FALLBACK20-NEXT:    movb %dl, %cl
+; FALLBACK20-NEXT:    shrl %cl, %eax
+; FALLBACK20-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; FALLBACK20-NEXT:    movl 124(%esp,%edx), %ebx
+; FALLBACK20-NEXT:    leal (%ebx,%ebx), %edx
+; FALLBACK20-NEXT:    movb %ch, %cl
+; FALLBACK20-NEXT:    shll %cl, %edx
+; FALLBACK20-NEXT:    orl %eax, %edx
+; FALLBACK20-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK20-NEXT:    # kill: def $cl killed $cl killed $ecx
+; FALLBACK20-NEXT:    sarl %cl, %ebx
+; FALLBACK20-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; FALLBACK20-NEXT:    movl %ebx, 60(%eax)
+; FALLBACK20-NEXT:    movl %edx, 56(%eax)
+; FALLBACK20-NEXT:    movl %esi, 48(%eax)
+; FALLBACK20-NEXT:    movl %ebp, 52(%eax)
+; FALLBACK20-NEXT:    movl %edi, 40(%eax)
+; FALLBACK20-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK20-NEXT:    movl %ecx, 44(%eax)
+; FALLBACK20-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK20-NEXT:    movl %ecx, 32(%eax)
+; FALLBACK20-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK20-NEXT:    movl %ecx, 36(%eax)
+; FALLBACK20-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK20-NEXT:    movl %ecx, 24(%eax)
+; FALLBACK20-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK20-NEXT:    movl %ecx, 28(%eax)
+; FALLBACK20-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK20-NEXT:    movl %ecx, 16(%eax)
+; FALLBACK20-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK20-NEXT:    movl %ecx, 20(%eax)
+; FALLBACK20-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK20-NEXT:    movl %ecx, 8(%eax)
+; FALLBACK20-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK20-NEXT:    movl %ecx, 12(%eax)
+; FALLBACK20-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK20-NEXT:    movl %ecx, (%eax)
+; FALLBACK20-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK20-NEXT:    movl %ecx, 4(%eax)
+; FALLBACK20-NEXT:    addl $204, %esp
+; FALLBACK20-NEXT:    popl %esi
+; FALLBACK20-NEXT:    popl %edi
+; FALLBACK20-NEXT:    popl %ebx
+; FALLBACK20-NEXT:    popl %ebp
+; FALLBACK20-NEXT:    retl
+;
+; FALLBACK21-LABEL: ashr_64bytes:
+; FALLBACK21:       # %bb.0:
+; FALLBACK21-NEXT:    pushl %ebp
+; FALLBACK21-NEXT:    pushl %ebx
+; FALLBACK21-NEXT:    pushl %edi
+; FALLBACK21-NEXT:    pushl %esi
+; FALLBACK21-NEXT:    subl $188, %esp
+; FALLBACK21-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; FALLBACK21-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; FALLBACK21-NEXT:    movups (%eax), %xmm0
+; FALLBACK21-NEXT:    movups 16(%eax), %xmm1
+; FALLBACK21-NEXT:    movups 32(%eax), %xmm2
+; FALLBACK21-NEXT:    movl 48(%eax), %edx
+; FALLBACK21-NEXT:    movl 52(%eax), %esi
+; FALLBACK21-NEXT:    movl 56(%eax), %edi
+; FALLBACK21-NEXT:    movl 60(%eax), %eax
+; FALLBACK21-NEXT:    movl (%ecx), %ecx
+; FALLBACK21-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK21-NEXT:    movl %edi, {{[0-9]+}}(%esp)
+; FALLBACK21-NEXT:    movl %esi, {{[0-9]+}}(%esp)
+; FALLBACK21-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; FALLBACK21-NEXT:    movaps %xmm2, {{[0-9]+}}(%esp)
+; FALLBACK21-NEXT:    movaps %xmm1, {{[0-9]+}}(%esp)
+; FALLBACK21-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
+; FALLBACK21-NEXT:    sarl $31, %eax
+; FALLBACK21-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK21-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK21-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK21-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK21-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK21-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK21-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK21-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK21-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK21-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK21-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK21-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK21-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK21-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK21-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK21-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK21-NEXT:    movl %ecx, %ebp
+; FALLBACK21-NEXT:    andl $60, %ebp
+; FALLBACK21-NEXT:    movl 56(%esp,%ebp), %edx
+; FALLBACK21-NEXT:    movl 52(%esp,%ebp), %eax
+; FALLBACK21-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK21-NEXT:    shll $3, %ecx
+; FALLBACK21-NEXT:    andl $24, %ecx
+; FALLBACK21-NEXT:    shrdl %cl, %edx, %eax
+; FALLBACK21-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK21-NEXT:    movl 64(%esp,%ebp), %edi
+; FALLBACK21-NEXT:    movl 60(%esp,%ebp), %eax
+; FALLBACK21-NEXT:    movl %eax, %esi
+; FALLBACK21-NEXT:    shrdl %cl, %edi, %esi
+; FALLBACK21-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK21-NEXT:    shrdl %cl, %eax, %edx
+; FALLBACK21-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK21-NEXT:    movl 72(%esp,%ebp), %esi
+; FALLBACK21-NEXT:    movl 68(%esp,%ebp), %eax
+; FALLBACK21-NEXT:    movl %eax, %edx
+; FALLBACK21-NEXT:    shrdl %cl, %esi, %edx
+; FALLBACK21-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK21-NEXT:    shrdl %cl, %eax, %edi
+; FALLBACK21-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK21-NEXT:    movl 80(%esp,%ebp), %edi
+; FALLBACK21-NEXT:    movl 76(%esp,%ebp), %eax
+; FALLBACK21-NEXT:    movl %eax, %edx
+; FALLBACK21-NEXT:    shrdl %cl, %edi, %edx
+; FALLBACK21-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK21-NEXT:    shrdl %cl, %eax, %esi
+; FALLBACK21-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK21-NEXT:    movl 88(%esp,%ebp), %esi
+; FALLBACK21-NEXT:    movl 84(%esp,%ebp), %eax
+; FALLBACK21-NEXT:    movl %eax, %edx
+; FALLBACK21-NEXT:    shrdl %cl, %esi, %edx
+; FALLBACK21-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK21-NEXT:    movl %esi, %edx
+; FALLBACK21-NEXT:    shrdl %cl, %eax, %edi
+; FALLBACK21-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK21-NEXT:    movl 96(%esp,%ebp), %esi
+; FALLBACK21-NEXT:    movl 92(%esp,%ebp), %eax
+; FALLBACK21-NEXT:    movl %eax, %edi
+; FALLBACK21-NEXT:    shrdl %cl, %esi, %edi
+; FALLBACK21-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK21-NEXT:    shrdl %cl, %eax, %edx
+; FALLBACK21-NEXT:    movl %edx, (%esp) # 4-byte Spill
+; FALLBACK21-NEXT:    movl 104(%esp,%ebp), %edx
+; FALLBACK21-NEXT:    movl 100(%esp,%ebp), %eax
+; FALLBACK21-NEXT:    movl %eax, %edi
+; FALLBACK21-NEXT:    shrdl %cl, %edx, %edi
+; FALLBACK21-NEXT:    shrdl %cl, %eax, %esi
+; FALLBACK21-NEXT:    movl 48(%esp,%ebp), %ebx
+; FALLBACK21-NEXT:    movl 108(%esp,%ebp), %eax
+; FALLBACK21-NEXT:    shrdl %cl, %eax, %edx
+; FALLBACK21-NEXT:    movl {{[0-9]+}}(%esp), %ebp
+; FALLBACK21-NEXT:    movl %edx, 56(%ebp)
+; FALLBACK21-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; FALLBACK21-NEXT:    shrdl %cl, %edx, %ebx
+; FALLBACK21-NEXT:    # kill: def $cl killed $cl killed $ecx
+; FALLBACK21-NEXT:    sarl %cl, %eax
+; FALLBACK21-NEXT:    movl %eax, 60(%ebp)
+; FALLBACK21-NEXT:    movl %esi, 48(%ebp)
+; FALLBACK21-NEXT:    movl %edi, 52(%ebp)
+; FALLBACK21-NEXT:    movl (%esp), %eax # 4-byte Reload
+; FALLBACK21-NEXT:    movl %eax, 40(%ebp)
+; FALLBACK21-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK21-NEXT:    movl %eax, 44(%ebp)
+; FALLBACK21-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK21-NEXT:    movl %eax, 32(%ebp)
+; FALLBACK21-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK21-NEXT:    movl %eax, 36(%ebp)
+; FALLBACK21-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK21-NEXT:    movl %eax, 24(%ebp)
+; FALLBACK21-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK21-NEXT:    movl %eax, 28(%ebp)
+; FALLBACK21-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK21-NEXT:    movl %eax, 16(%ebp)
+; FALLBACK21-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK21-NEXT:    movl %eax, 20(%ebp)
+; FALLBACK21-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK21-NEXT:    movl %eax, 8(%ebp)
+; FALLBACK21-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK21-NEXT:    movl %eax, 12(%ebp)
+; FALLBACK21-NEXT:    movl %ebx, (%ebp)
+; FALLBACK21-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK21-NEXT:    movl %eax, 4(%ebp)
+; FALLBACK21-NEXT:    addl $188, %esp
+; FALLBACK21-NEXT:    popl %esi
+; FALLBACK21-NEXT:    popl %edi
+; FALLBACK21-NEXT:    popl %ebx
+; FALLBACK21-NEXT:    popl %ebp
+; FALLBACK21-NEXT:    retl
+;
+; FALLBACK22-LABEL: ashr_64bytes:
+; FALLBACK22:       # %bb.0:
+; FALLBACK22-NEXT:    pushl %ebp
+; FALLBACK22-NEXT:    pushl %ebx
+; FALLBACK22-NEXT:    pushl %edi
+; FALLBACK22-NEXT:    pushl %esi
+; FALLBACK22-NEXT:    subl $204, %esp
+; FALLBACK22-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; FALLBACK22-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; FALLBACK22-NEXT:    movups (%ecx), %xmm0
+; FALLBACK22-NEXT:    movups 16(%ecx), %xmm1
+; FALLBACK22-NEXT:    movups 32(%ecx), %xmm2
+; FALLBACK22-NEXT:    movl 48(%ecx), %edx
+; FALLBACK22-NEXT:    movl 52(%ecx), %esi
+; FALLBACK22-NEXT:    movl 56(%ecx), %edi
+; FALLBACK22-NEXT:    movl 60(%ecx), %ecx
+; FALLBACK22-NEXT:    movl (%eax), %eax
+; FALLBACK22-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; FALLBACK22-NEXT:    movl %edi, {{[0-9]+}}(%esp)
+; FALLBACK22-NEXT:    movl %esi, {{[0-9]+}}(%esp)
+; FALLBACK22-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; FALLBACK22-NEXT:    movaps %xmm2, {{[0-9]+}}(%esp)
+; FALLBACK22-NEXT:    movaps %xmm1, {{[0-9]+}}(%esp)
+; FALLBACK22-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
+; FALLBACK22-NEXT:    sarl $31, %ecx
+; FALLBACK22-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; FALLBACK22-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; FALLBACK22-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; FALLBACK22-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; FALLBACK22-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; FALLBACK22-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; FALLBACK22-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; FALLBACK22-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; FALLBACK22-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; FALLBACK22-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; FALLBACK22-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; FALLBACK22-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; FALLBACK22-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; FALLBACK22-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; FALLBACK22-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; FALLBACK22-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; FALLBACK22-NEXT:    movl %eax, %ecx
+; FALLBACK22-NEXT:    leal (,%eax,8), %edx
+; FALLBACK22-NEXT:    andl $24, %edx
+; FALLBACK22-NEXT:    andl $60, %ecx
+; FALLBACK22-NEXT:    movl 68(%esp,%ecx), %esi
+; FALLBACK22-NEXT:    movl 72(%esp,%ecx), %edi
+; FALLBACK22-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK22-NEXT:    shrxl %edx, %esi, %eax
+; FALLBACK22-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK22-NEXT:    movl %edx, %ebx
+; FALLBACK22-NEXT:    notb %bl
+; FALLBACK22-NEXT:    leal (%edi,%edi), %ebp
+; FALLBACK22-NEXT:    shlxl %ebx, %ebp, %eax
+; FALLBACK22-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; FALLBACK22-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK22-NEXT:    shrxl %edx, 64(%esp,%ecx), %edi
+; FALLBACK22-NEXT:    addl %esi, %esi
+; FALLBACK22-NEXT:    shlxl %ebx, %esi, %eax
+; FALLBACK22-NEXT:    orl %edi, %eax
+; FALLBACK22-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK22-NEXT:    movl 80(%esp,%ecx), %esi
+; FALLBACK22-NEXT:    leal (%esi,%esi), %edi
+; FALLBACK22-NEXT:    shlxl %ebx, %edi, %eax
+; FALLBACK22-NEXT:    movl 76(%esp,%ecx), %edi
+; FALLBACK22-NEXT:    shrxl %edx, %edi, %ebp
+; FALLBACK22-NEXT:    orl %ebp, %eax
+; FALLBACK22-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK22-NEXT:    shrxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; FALLBACK22-NEXT:    addl %edi, %edi
+; FALLBACK22-NEXT:    shlxl %ebx, %edi, %edi
+; FALLBACK22-NEXT:    orl %eax, %edi
+; FALLBACK22-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK22-NEXT:    movl 88(%esp,%ecx), %eax
+; FALLBACK22-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK22-NEXT:    leal (%eax,%eax), %edi
+; FALLBACK22-NEXT:    shlxl %ebx, %edi, %eax
+; FALLBACK22-NEXT:    movl 84(%esp,%ecx), %edi
+; FALLBACK22-NEXT:    shrxl %edx, %edi, %ebp
+; FALLBACK22-NEXT:    orl %ebp, %eax
+; FALLBACK22-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK22-NEXT:    shrxl %edx, %esi, %esi
+; FALLBACK22-NEXT:    addl %edi, %edi
+; FALLBACK22-NEXT:    shlxl %ebx, %edi, %eax
+; FALLBACK22-NEXT:    orl %esi, %eax
+; FALLBACK22-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK22-NEXT:    movl 96(%esp,%ecx), %esi
+; FALLBACK22-NEXT:    leal (%esi,%esi), %edi
+; FALLBACK22-NEXT:    shlxl %ebx, %edi, %eax
+; FALLBACK22-NEXT:    movl 92(%esp,%ecx), %edi
+; FALLBACK22-NEXT:    shrxl %edx, %edi, %ebp
+; FALLBACK22-NEXT:    orl %ebp, %eax
+; FALLBACK22-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK22-NEXT:    shrxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; FALLBACK22-NEXT:    addl %edi, %edi
+; FALLBACK22-NEXT:    shlxl %ebx, %edi, %edi
+; FALLBACK22-NEXT:    orl %eax, %edi
+; FALLBACK22-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK22-NEXT:    movl 104(%esp,%ecx), %eax
+; FALLBACK22-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK22-NEXT:    leal (%eax,%eax), %edi
+; FALLBACK22-NEXT:    shlxl %ebx, %edi, %eax
+; FALLBACK22-NEXT:    movl 100(%esp,%ecx), %edi
+; FALLBACK22-NEXT:    shrxl %edx, %edi, %ebp
+; FALLBACK22-NEXT:    orl %ebp, %eax
+; FALLBACK22-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK22-NEXT:    shrxl %edx, %esi, %esi
+; FALLBACK22-NEXT:    addl %edi, %edi
+; FALLBACK22-NEXT:    shlxl %ebx, %edi, %eax
+; FALLBACK22-NEXT:    orl %esi, %eax
+; FALLBACK22-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK22-NEXT:    movl 112(%esp,%ecx), %eax
+; FALLBACK22-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK22-NEXT:    leal (%eax,%eax), %esi
+; FALLBACK22-NEXT:    shlxl %ebx, %esi, %eax
+; FALLBACK22-NEXT:    movl 108(%esp,%ecx), %esi
+; FALLBACK22-NEXT:    movl %ecx, %edi
+; FALLBACK22-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK22-NEXT:    shrxl %edx, %esi, %ebp
+; FALLBACK22-NEXT:    orl %ebp, %eax
+; FALLBACK22-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK22-NEXT:    shrxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
+; FALLBACK22-NEXT:    addl %esi, %esi
+; FALLBACK22-NEXT:    shlxl %ebx, %esi, %esi
+; FALLBACK22-NEXT:    orl %ecx, %esi
+; FALLBACK22-NEXT:    movl 120(%esp,%edi), %ebp
+; FALLBACK22-NEXT:    leal (%ebp,%ebp), %ecx
+; FALLBACK22-NEXT:    shlxl %ebx, %ecx, %ecx
+; FALLBACK22-NEXT:    movl 116(%esp,%edi), %eax
+; FALLBACK22-NEXT:    shrxl %edx, %eax, %edi
+; FALLBACK22-NEXT:    orl %edi, %ecx
+; FALLBACK22-NEXT:    shrxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
+; FALLBACK22-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK22-NEXT:    addl %eax, %eax
+; FALLBACK22-NEXT:    shlxl %ebx, %eax, %edi
+; FALLBACK22-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
+; FALLBACK22-NEXT:    shrxl %edx, %ebp, %eax
+; FALLBACK22-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
+; FALLBACK22-NEXT:    movl 124(%esp,%ebp), %ebp
+; FALLBACK22-NEXT:    sarxl %edx, %ebp, %edx
+; FALLBACK22-NEXT:    addl %ebp, %ebp
+; FALLBACK22-NEXT:    shlxl %ebx, %ebp, %ebx
+; FALLBACK22-NEXT:    orl %eax, %ebx
+; FALLBACK22-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; FALLBACK22-NEXT:    movl %edx, 60(%eax)
+; FALLBACK22-NEXT:    movl %ebx, 56(%eax)
+; FALLBACK22-NEXT:    movl %edi, 48(%eax)
+; FALLBACK22-NEXT:    movl %ecx, 52(%eax)
+; FALLBACK22-NEXT:    movl %esi, 40(%eax)
+; FALLBACK22-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK22-NEXT:    movl %ecx, 44(%eax)
+; FALLBACK22-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK22-NEXT:    movl %ecx, 32(%eax)
+; FALLBACK22-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK22-NEXT:    movl %ecx, 36(%eax)
+; FALLBACK22-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK22-NEXT:    movl %ecx, 24(%eax)
+; FALLBACK22-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK22-NEXT:    movl %ecx, 28(%eax)
+; FALLBACK22-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK22-NEXT:    movl %ecx, 16(%eax)
+; FALLBACK22-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK22-NEXT:    movl %ecx, 20(%eax)
+; FALLBACK22-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK22-NEXT:    movl %ecx, 8(%eax)
+; FALLBACK22-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK22-NEXT:    movl %ecx, 12(%eax)
+; FALLBACK22-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK22-NEXT:    movl %ecx, (%eax)
+; FALLBACK22-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK22-NEXT:    movl %ecx, 4(%eax)
+; FALLBACK22-NEXT:    addl $204, %esp
+; FALLBACK22-NEXT:    popl %esi
+; FALLBACK22-NEXT:    popl %edi
+; FALLBACK22-NEXT:    popl %ebx
+; FALLBACK22-NEXT:    popl %ebp
+; FALLBACK22-NEXT:    retl
+;
+; FALLBACK23-LABEL: ashr_64bytes:
+; FALLBACK23:       # %bb.0:
+; FALLBACK23-NEXT:    pushl %ebp
+; FALLBACK23-NEXT:    pushl %ebx
+; FALLBACK23-NEXT:    pushl %edi
+; FALLBACK23-NEXT:    pushl %esi
+; FALLBACK23-NEXT:    subl $188, %esp
+; FALLBACK23-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; FALLBACK23-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; FALLBACK23-NEXT:    movups (%eax), %xmm0
+; FALLBACK23-NEXT:    movups 16(%eax), %xmm1
+; FALLBACK23-NEXT:    movups 32(%eax), %xmm2
+; FALLBACK23-NEXT:    movl 48(%eax), %edx
+; FALLBACK23-NEXT:    movl 52(%eax), %esi
+; FALLBACK23-NEXT:    movl 56(%eax), %edi
+; FALLBACK23-NEXT:    movl 60(%eax), %eax
+; FALLBACK23-NEXT:    movl (%ecx), %ecx
+; FALLBACK23-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK23-NEXT:    movl %edi, {{[0-9]+}}(%esp)
+; FALLBACK23-NEXT:    movl %esi, {{[0-9]+}}(%esp)
+; FALLBACK23-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; FALLBACK23-NEXT:    movaps %xmm2, {{[0-9]+}}(%esp)
+; FALLBACK23-NEXT:    movaps %xmm1, {{[0-9]+}}(%esp)
+; FALLBACK23-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
+; FALLBACK23-NEXT:    sarl $31, %eax
+; FALLBACK23-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK23-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK23-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK23-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK23-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK23-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK23-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK23-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK23-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK23-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK23-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK23-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK23-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK23-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK23-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK23-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK23-NEXT:    movl %ecx, %ebp
+; FALLBACK23-NEXT:    andl $60, %ebp
+; FALLBACK23-NEXT:    movl 56(%esp,%ebp), %edx
+; FALLBACK23-NEXT:    movl 52(%esp,%ebp), %eax
+; FALLBACK23-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK23-NEXT:    shll $3, %ecx
+; FALLBACK23-NEXT:    andl $24, %ecx
+; FALLBACK23-NEXT:    shrdl %cl, %edx, %eax
+; FALLBACK23-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK23-NEXT:    movl 64(%esp,%ebp), %edi
+; FALLBACK23-NEXT:    movl 60(%esp,%ebp), %eax
+; FALLBACK23-NEXT:    movl %eax, %esi
+; FALLBACK23-NEXT:    shrdl %cl, %edi, %esi
+; FALLBACK23-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK23-NEXT:    shrdl %cl, %eax, %edx
+; FALLBACK23-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK23-NEXT:    movl 72(%esp,%ebp), %esi
+; FALLBACK23-NEXT:    movl 68(%esp,%ebp), %eax
+; FALLBACK23-NEXT:    movl %eax, %edx
+; FALLBACK23-NEXT:    shrdl %cl, %esi, %edx
+; FALLBACK23-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK23-NEXT:    shrdl %cl, %eax, %edi
+; FALLBACK23-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK23-NEXT:    movl 80(%esp,%ebp), %edi
+; FALLBACK23-NEXT:    movl 76(%esp,%ebp), %eax
+; FALLBACK23-NEXT:    movl %eax, %edx
+; FALLBACK23-NEXT:    shrdl %cl, %edi, %edx
+; FALLBACK23-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK23-NEXT:    shrdl %cl, %eax, %esi
+; FALLBACK23-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK23-NEXT:    movl 88(%esp,%ebp), %ebx
+; FALLBACK23-NEXT:    movl 84(%esp,%ebp), %eax
+; FALLBACK23-NEXT:    movl %eax, %edx
+; FALLBACK23-NEXT:    shrdl %cl, %ebx, %edx
+; FALLBACK23-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK23-NEXT:    shrdl %cl, %eax, %edi
+; FALLBACK23-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK23-NEXT:    movl 96(%esp,%ebp), %esi
+; FALLBACK23-NEXT:    movl 92(%esp,%ebp), %eax
+; FALLBACK23-NEXT:    movl %eax, %edx
+; FALLBACK23-NEXT:    shrdl %cl, %esi, %edx
+; FALLBACK23-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK23-NEXT:    shrdl %cl, %eax, %ebx
+; FALLBACK23-NEXT:    movl 104(%esp,%ebp), %eax
+; FALLBACK23-NEXT:    movl 100(%esp,%ebp), %edi
+; FALLBACK23-NEXT:    movl %edi, %edx
+; FALLBACK23-NEXT:    shrdl %cl, %eax, %edx
+; FALLBACK23-NEXT:    shrdl %cl, %edi, %esi
+; FALLBACK23-NEXT:    movl 48(%esp,%ebp), %edi
+; FALLBACK23-NEXT:    movl 108(%esp,%ebp), %ebp
+; FALLBACK23-NEXT:    movl %ebp, (%esp) # 4-byte Spill
+; FALLBACK23-NEXT:    shrdl %cl, %ebp, %eax
+; FALLBACK23-NEXT:    movl {{[0-9]+}}(%esp), %ebp
+; FALLBACK23-NEXT:    movl %eax, 56(%ebp)
+; FALLBACK23-NEXT:    movl %esi, 48(%ebp)
+; FALLBACK23-NEXT:    movl %edx, 52(%ebp)
+; FALLBACK23-NEXT:    movl %ebx, 40(%ebp)
+; FALLBACK23-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK23-NEXT:    movl %eax, 44(%ebp)
+; FALLBACK23-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK23-NEXT:    movl %eax, 32(%ebp)
+; FALLBACK23-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK23-NEXT:    movl %eax, 36(%ebp)
+; FALLBACK23-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK23-NEXT:    movl %eax, 24(%ebp)
+; FALLBACK23-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK23-NEXT:    movl %eax, 28(%ebp)
+; FALLBACK23-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK23-NEXT:    movl %eax, 16(%ebp)
+; FALLBACK23-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK23-NEXT:    movl %eax, 20(%ebp)
+; FALLBACK23-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK23-NEXT:    movl %eax, 8(%ebp)
+; FALLBACK23-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK23-NEXT:    movl %eax, 12(%ebp)
+; FALLBACK23-NEXT:    sarxl %ecx, (%esp), %eax # 4-byte Folded Reload
+; FALLBACK23-NEXT:    # kill: def $cl killed $cl killed $ecx
+; FALLBACK23-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; FALLBACK23-NEXT:    shrdl %cl, %edx, %edi
+; FALLBACK23-NEXT:    movl %edi, (%ebp)
+; FALLBACK23-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK23-NEXT:    movl %ecx, 4(%ebp)
+; FALLBACK23-NEXT:    movl %eax, 60(%ebp)
+; FALLBACK23-NEXT:    addl $188, %esp
+; FALLBACK23-NEXT:    popl %esi
+; FALLBACK23-NEXT:    popl %edi
+; FALLBACK23-NEXT:    popl %ebx
+; FALLBACK23-NEXT:    popl %ebp
+; FALLBACK23-NEXT:    retl
+;
+; FALLBACK24-LABEL: ashr_64bytes:
+; FALLBACK24:       # %bb.0:
+; FALLBACK24-NEXT:    pushl %ebp
+; FALLBACK24-NEXT:    pushl %ebx
+; FALLBACK24-NEXT:    pushl %edi
+; FALLBACK24-NEXT:    pushl %esi
+; FALLBACK24-NEXT:    subl $204, %esp
+; FALLBACK24-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; FALLBACK24-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; FALLBACK24-NEXT:    vmovups (%ecx), %ymm0
+; FALLBACK24-NEXT:    vmovups 32(%ecx), %xmm1
+; FALLBACK24-NEXT:    movl 48(%ecx), %edx
+; FALLBACK24-NEXT:    movl 52(%ecx), %esi
+; FALLBACK24-NEXT:    movl 56(%ecx), %edi
+; FALLBACK24-NEXT:    movl 60(%ecx), %ecx
+; FALLBACK24-NEXT:    movl (%eax), %eax
+; FALLBACK24-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; FALLBACK24-NEXT:    movl %edi, {{[0-9]+}}(%esp)
+; FALLBACK24-NEXT:    movl %esi, {{[0-9]+}}(%esp)
+; FALLBACK24-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; FALLBACK24-NEXT:    vmovaps %xmm1, {{[0-9]+}}(%esp)
+; FALLBACK24-NEXT:    vmovups %ymm0, {{[0-9]+}}(%esp)
+; FALLBACK24-NEXT:    sarl $31, %ecx
+; FALLBACK24-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; FALLBACK24-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; FALLBACK24-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; FALLBACK24-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; FALLBACK24-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; FALLBACK24-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; FALLBACK24-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; FALLBACK24-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; FALLBACK24-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; FALLBACK24-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; FALLBACK24-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; FALLBACK24-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; FALLBACK24-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; FALLBACK24-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; FALLBACK24-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; FALLBACK24-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; FALLBACK24-NEXT:    movl %eax, %esi
+; FALLBACK24-NEXT:    andl $60, %esi
+; FALLBACK24-NEXT:    movl 68(%esp,%esi), %edx
+; FALLBACK24-NEXT:    shll $3, %eax
+; FALLBACK24-NEXT:    andl $24, %eax
+; FALLBACK24-NEXT:    movl %edx, %edi
+; FALLBACK24-NEXT:    movl %eax, %ecx
+; FALLBACK24-NEXT:    shrl %cl, %edi
+; FALLBACK24-NEXT:    movl 72(%esp,%esi), %ecx
+; FALLBACK24-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK24-NEXT:    leal (%ecx,%ecx), %ebx
+; FALLBACK24-NEXT:    movb %al, %ch
+; FALLBACK24-NEXT:    notb %ch
+; FALLBACK24-NEXT:    movb %ch, %cl
+; FALLBACK24-NEXT:    shll %cl, %ebx
+; FALLBACK24-NEXT:    orl %edi, %ebx
+; FALLBACK24-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK24-NEXT:    movl 64(%esp,%esi), %edi
+; FALLBACK24-NEXT:    movb %al, %cl
+; FALLBACK24-NEXT:    shrl %cl, %edi
+; FALLBACK24-NEXT:    addl %edx, %edx
+; FALLBACK24-NEXT:    movb %ch, %cl
+; FALLBACK24-NEXT:    shll %cl, %edx
+; FALLBACK24-NEXT:    orl %edi, %edx
+; FALLBACK24-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK24-NEXT:    movl 76(%esp,%esi), %edx
+; FALLBACK24-NEXT:    movl %edx, %ebp
+; FALLBACK24-NEXT:    movb %al, %cl
+; FALLBACK24-NEXT:    shrl %cl, %ebp
+; FALLBACK24-NEXT:    movl 80(%esp,%esi), %edi
+; FALLBACK24-NEXT:    leal (%edi,%edi), %ebx
+; FALLBACK24-NEXT:    movb %ch, %cl
+; FALLBACK24-NEXT:    shll %cl, %ebx
+; FALLBACK24-NEXT:    orl %ebp, %ebx
+; FALLBACK24-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK24-NEXT:    movb %al, %cl
+; FALLBACK24-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; FALLBACK24-NEXT:    shrl %cl, %ebx
+; FALLBACK24-NEXT:    addl %edx, %edx
+; FALLBACK24-NEXT:    movb %ch, %cl
+; FALLBACK24-NEXT:    shll %cl, %edx
+; FALLBACK24-NEXT:    orl %ebx, %edx
+; FALLBACK24-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK24-NEXT:    movl 84(%esp,%esi), %ebx
+; FALLBACK24-NEXT:    movl %ebx, %ebp
+; FALLBACK24-NEXT:    movl %eax, %edx
+; FALLBACK24-NEXT:    movb %dl, %cl
+; FALLBACK24-NEXT:    shrl %cl, %ebp
+; FALLBACK24-NEXT:    movl 88(%esp,%esi), %eax
+; FALLBACK24-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK24-NEXT:    addl %eax, %eax
+; FALLBACK24-NEXT:    movb %ch, %cl
+; FALLBACK24-NEXT:    shll %cl, %eax
+; FALLBACK24-NEXT:    orl %ebp, %eax
+; FALLBACK24-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK24-NEXT:    movb %dl, %cl
+; FALLBACK24-NEXT:    shrl %cl, %edi
+; FALLBACK24-NEXT:    addl %ebx, %ebx
+; FALLBACK24-NEXT:    movb %ch, %cl
+; FALLBACK24-NEXT:    movb %ch, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill
+; FALLBACK24-NEXT:    shll %cl, %ebx
+; FALLBACK24-NEXT:    orl %edi, %ebx
+; FALLBACK24-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK24-NEXT:    movl 92(%esp,%esi), %ebx
+; FALLBACK24-NEXT:    movl %ebx, %ebp
+; FALLBACK24-NEXT:    movb %dl, %cl
+; FALLBACK24-NEXT:    shrl %cl, %ebp
+; FALLBACK24-NEXT:    movl 96(%esp,%esi), %edi
+; FALLBACK24-NEXT:    leal (%edi,%edi), %eax
+; FALLBACK24-NEXT:    movb %ch, %cl
+; FALLBACK24-NEXT:    shll %cl, %eax
+; FALLBACK24-NEXT:    orl %ebp, %eax
+; FALLBACK24-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK24-NEXT:    movb %dl, %cl
+; FALLBACK24-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK24-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK24-NEXT:    shrl %cl, %eax
+; FALLBACK24-NEXT:    addl %ebx, %ebx
+; FALLBACK24-NEXT:    movb %ch, %cl
+; FALLBACK24-NEXT:    shll %cl, %ebx
+; FALLBACK24-NEXT:    orl %eax, %ebx
+; FALLBACK24-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK24-NEXT:    movl 100(%esp,%esi), %ebx
+; FALLBACK24-NEXT:    movl %ebx, %ebp
+; FALLBACK24-NEXT:    movb %dl, %cl
+; FALLBACK24-NEXT:    shrl %cl, %ebp
+; FALLBACK24-NEXT:    movl 104(%esp,%esi), %edx
+; FALLBACK24-NEXT:    leal (%edx,%edx), %eax
+; FALLBACK24-NEXT:    movb %ch, %cl
+; FALLBACK24-NEXT:    shll %cl, %eax
+; FALLBACK24-NEXT:    orl %ebp, %eax
+; FALLBACK24-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK24-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK24-NEXT:    movb %al, %cl
+; FALLBACK24-NEXT:    shrl %cl, %edi
+; FALLBACK24-NEXT:    addl %ebx, %ebx
+; FALLBACK24-NEXT:    movb %ch, %cl
+; FALLBACK24-NEXT:    shll %cl, %ebx
+; FALLBACK24-NEXT:    orl %edi, %ebx
+; FALLBACK24-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK24-NEXT:    movl 108(%esp,%esi), %edi
+; FALLBACK24-NEXT:    movl %edi, %ebp
+; FALLBACK24-NEXT:    movl %eax, %ecx
+; FALLBACK24-NEXT:    shrl %cl, %ebp
+; FALLBACK24-NEXT:    movl 112(%esp,%esi), %ecx
+; FALLBACK24-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK24-NEXT:    leal (%ecx,%ecx), %ebx
+; FALLBACK24-NEXT:    movb {{[-0-9]+}}(%e{{[sb]}}p), %ch # 1-byte Reload
+; FALLBACK24-NEXT:    movb %ch, %cl
+; FALLBACK24-NEXT:    shll %cl, %ebx
+; FALLBACK24-NEXT:    orl %ebp, %ebx
+; FALLBACK24-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK24-NEXT:    movb %al, %cl
+; FALLBACK24-NEXT:    shrl %cl, %edx
+; FALLBACK24-NEXT:    addl %edi, %edi
+; FALLBACK24-NEXT:    movb %ch, %cl
+; FALLBACK24-NEXT:    shll %cl, %edi
+; FALLBACK24-NEXT:    orl %edx, %edi
+; FALLBACK24-NEXT:    movl %esi, %edx
+; FALLBACK24-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK24-NEXT:    movl 116(%esp,%esi), %esi
+; FALLBACK24-NEXT:    movl %esi, %ebx
+; FALLBACK24-NEXT:    movb %al, %cl
+; FALLBACK24-NEXT:    shrl %cl, %ebx
+; FALLBACK24-NEXT:    movl 120(%esp,%edx), %eax
+; FALLBACK24-NEXT:    leal (%eax,%eax), %ebp
+; FALLBACK24-NEXT:    movb %ch, %cl
+; FALLBACK24-NEXT:    shll %cl, %ebp
+; FALLBACK24-NEXT:    orl %ebx, %ebp
+; FALLBACK24-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; FALLBACK24-NEXT:    movb %dl, %cl
+; FALLBACK24-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; FALLBACK24-NEXT:    shrl %cl, %ebx
+; FALLBACK24-NEXT:    addl %esi, %esi
+; FALLBACK24-NEXT:    movb %ch, %cl
+; FALLBACK24-NEXT:    shll %cl, %esi
+; FALLBACK24-NEXT:    orl %ebx, %esi
+; FALLBACK24-NEXT:    movb %dl, %cl
+; FALLBACK24-NEXT:    shrl %cl, %eax
+; FALLBACK24-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; FALLBACK24-NEXT:    movl 124(%esp,%edx), %ebx
+; FALLBACK24-NEXT:    leal (%ebx,%ebx), %edx
+; FALLBACK24-NEXT:    movb %ch, %cl
+; FALLBACK24-NEXT:    shll %cl, %edx
+; FALLBACK24-NEXT:    orl %eax, %edx
+; FALLBACK24-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK24-NEXT:    # kill: def $cl killed $cl killed $ecx
+; FALLBACK24-NEXT:    sarl %cl, %ebx
+; FALLBACK24-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; FALLBACK24-NEXT:    movl %ebx, 60(%eax)
+; FALLBACK24-NEXT:    movl %edx, 56(%eax)
+; FALLBACK24-NEXT:    movl %esi, 48(%eax)
+; FALLBACK24-NEXT:    movl %ebp, 52(%eax)
+; FALLBACK24-NEXT:    movl %edi, 40(%eax)
+; FALLBACK24-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK24-NEXT:    movl %ecx, 44(%eax)
+; FALLBACK24-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK24-NEXT:    movl %ecx, 32(%eax)
+; FALLBACK24-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK24-NEXT:    movl %ecx, 36(%eax)
+; FALLBACK24-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK24-NEXT:    movl %ecx, 24(%eax)
+; FALLBACK24-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK24-NEXT:    movl %ecx, 28(%eax)
+; FALLBACK24-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK24-NEXT:    movl %ecx, 16(%eax)
+; FALLBACK24-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK24-NEXT:    movl %ecx, 20(%eax)
+; FALLBACK24-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK24-NEXT:    movl %ecx, 8(%eax)
+; FALLBACK24-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK24-NEXT:    movl %ecx, 12(%eax)
+; FALLBACK24-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK24-NEXT:    movl %ecx, (%eax)
+; FALLBACK24-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK24-NEXT:    movl %ecx, 4(%eax)
+; FALLBACK24-NEXT:    addl $204, %esp
+; FALLBACK24-NEXT:    popl %esi
+; FALLBACK24-NEXT:    popl %edi
+; FALLBACK24-NEXT:    popl %ebx
+; FALLBACK24-NEXT:    popl %ebp
+; FALLBACK24-NEXT:    vzeroupper
+; FALLBACK24-NEXT:    retl
+;
+; FALLBACK25-LABEL: ashr_64bytes:
+; FALLBACK25:       # %bb.0:
+; FALLBACK25-NEXT:    pushl %ebp
+; FALLBACK25-NEXT:    pushl %ebx
+; FALLBACK25-NEXT:    pushl %edi
+; FALLBACK25-NEXT:    pushl %esi
+; FALLBACK25-NEXT:    subl $188, %esp
+; FALLBACK25-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; FALLBACK25-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; FALLBACK25-NEXT:    vmovups (%eax), %ymm0
+; FALLBACK25-NEXT:    vmovups 32(%eax), %xmm1
+; FALLBACK25-NEXT:    movl 48(%eax), %edx
+; FALLBACK25-NEXT:    movl 52(%eax), %esi
+; FALLBACK25-NEXT:    movl 56(%eax), %edi
+; FALLBACK25-NEXT:    movl 60(%eax), %eax
+; FALLBACK25-NEXT:    movl (%ecx), %ecx
+; FALLBACK25-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK25-NEXT:    movl %edi, {{[0-9]+}}(%esp)
+; FALLBACK25-NEXT:    movl %esi, {{[0-9]+}}(%esp)
+; FALLBACK25-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; FALLBACK25-NEXT:    vmovaps %xmm1, {{[0-9]+}}(%esp)
+; FALLBACK25-NEXT:    vmovups %ymm0, {{[0-9]+}}(%esp)
+; FALLBACK25-NEXT:    sarl $31, %eax
+; FALLBACK25-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK25-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK25-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK25-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK25-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK25-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK25-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK25-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK25-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK25-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK25-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK25-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK25-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK25-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK25-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK25-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK25-NEXT:    movl %ecx, %ebp
+; FALLBACK25-NEXT:    andl $60, %ebp
+; FALLBACK25-NEXT:    movl 56(%esp,%ebp), %edx
+; FALLBACK25-NEXT:    movl 52(%esp,%ebp), %eax
+; FALLBACK25-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK25-NEXT:    shll $3, %ecx
+; FALLBACK25-NEXT:    andl $24, %ecx
+; FALLBACK25-NEXT:    shrdl %cl, %edx, %eax
+; FALLBACK25-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK25-NEXT:    movl 64(%esp,%ebp), %edi
+; FALLBACK25-NEXT:    movl 60(%esp,%ebp), %eax
+; FALLBACK25-NEXT:    movl %eax, %esi
+; FALLBACK25-NEXT:    shrdl %cl, %edi, %esi
+; FALLBACK25-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK25-NEXT:    shrdl %cl, %eax, %edx
+; FALLBACK25-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK25-NEXT:    movl 72(%esp,%ebp), %esi
+; FALLBACK25-NEXT:    movl 68(%esp,%ebp), %eax
+; FALLBACK25-NEXT:    movl %eax, %edx
+; FALLBACK25-NEXT:    shrdl %cl, %esi, %edx
+; FALLBACK25-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK25-NEXT:    shrdl %cl, %eax, %edi
+; FALLBACK25-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK25-NEXT:    movl 80(%esp,%ebp), %edi
+; FALLBACK25-NEXT:    movl 76(%esp,%ebp), %eax
+; FALLBACK25-NEXT:    movl %eax, %edx
+; FALLBACK25-NEXT:    shrdl %cl, %edi, %edx
+; FALLBACK25-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK25-NEXT:    shrdl %cl, %eax, %esi
+; FALLBACK25-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK25-NEXT:    movl 88(%esp,%ebp), %esi
+; FALLBACK25-NEXT:    movl 84(%esp,%ebp), %eax
+; FALLBACK25-NEXT:    movl %eax, %edx
+; FALLBACK25-NEXT:    shrdl %cl, %esi, %edx
+; FALLBACK25-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK25-NEXT:    movl %esi, %edx
+; FALLBACK25-NEXT:    shrdl %cl, %eax, %edi
+; FALLBACK25-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK25-NEXT:    movl 96(%esp,%ebp), %esi
+; FALLBACK25-NEXT:    movl 92(%esp,%ebp), %eax
+; FALLBACK25-NEXT:    movl %eax, %edi
+; FALLBACK25-NEXT:    shrdl %cl, %esi, %edi
+; FALLBACK25-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK25-NEXT:    shrdl %cl, %eax, %edx
+; FALLBACK25-NEXT:    movl %edx, (%esp) # 4-byte Spill
+; FALLBACK25-NEXT:    movl 104(%esp,%ebp), %edx
+; FALLBACK25-NEXT:    movl 100(%esp,%ebp), %eax
+; FALLBACK25-NEXT:    movl %eax, %edi
+; FALLBACK25-NEXT:    shrdl %cl, %edx, %edi
+; FALLBACK25-NEXT:    shrdl %cl, %eax, %esi
+; FALLBACK25-NEXT:    movl 48(%esp,%ebp), %ebx
+; FALLBACK25-NEXT:    movl 108(%esp,%ebp), %eax
+; FALLBACK25-NEXT:    shrdl %cl, %eax, %edx
+; FALLBACK25-NEXT:    movl {{[0-9]+}}(%esp), %ebp
+; FALLBACK25-NEXT:    movl %edx, 56(%ebp)
+; FALLBACK25-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; FALLBACK25-NEXT:    shrdl %cl, %edx, %ebx
+; FALLBACK25-NEXT:    # kill: def $cl killed $cl killed $ecx
+; FALLBACK25-NEXT:    sarl %cl, %eax
+; FALLBACK25-NEXT:    movl %eax, 60(%ebp)
+; FALLBACK25-NEXT:    movl %esi, 48(%ebp)
+; FALLBACK25-NEXT:    movl %edi, 52(%ebp)
+; FALLBACK25-NEXT:    movl (%esp), %eax # 4-byte Reload
+; FALLBACK25-NEXT:    movl %eax, 40(%ebp)
+; FALLBACK25-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK25-NEXT:    movl %eax, 44(%ebp)
+; FALLBACK25-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK25-NEXT:    movl %eax, 32(%ebp)
+; FALLBACK25-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK25-NEXT:    movl %eax, 36(%ebp)
+; FALLBACK25-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK25-NEXT:    movl %eax, 24(%ebp)
+; FALLBACK25-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK25-NEXT:    movl %eax, 28(%ebp)
+; FALLBACK25-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK25-NEXT:    movl %eax, 16(%ebp)
+; FALLBACK25-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK25-NEXT:    movl %eax, 20(%ebp)
+; FALLBACK25-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK25-NEXT:    movl %eax, 8(%ebp)
+; FALLBACK25-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK25-NEXT:    movl %eax, 12(%ebp)
+; FALLBACK25-NEXT:    movl %ebx, (%ebp)
+; FALLBACK25-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK25-NEXT:    movl %eax, 4(%ebp)
+; FALLBACK25-NEXT:    addl $188, %esp
+; FALLBACK25-NEXT:    popl %esi
+; FALLBACK25-NEXT:    popl %edi
+; FALLBACK25-NEXT:    popl %ebx
+; FALLBACK25-NEXT:    popl %ebp
+; FALLBACK25-NEXT:    vzeroupper
+; FALLBACK25-NEXT:    retl
+;
+; FALLBACK26-LABEL: ashr_64bytes:
+; FALLBACK26:       # %bb.0:
+; FALLBACK26-NEXT:    pushl %ebp
+; FALLBACK26-NEXT:    pushl %ebx
+; FALLBACK26-NEXT:    pushl %edi
+; FALLBACK26-NEXT:    pushl %esi
+; FALLBACK26-NEXT:    subl $204, %esp
+; FALLBACK26-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; FALLBACK26-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; FALLBACK26-NEXT:    vmovups (%ecx), %ymm0
+; FALLBACK26-NEXT:    vmovups 32(%ecx), %xmm1
+; FALLBACK26-NEXT:    movl 48(%ecx), %edx
+; FALLBACK26-NEXT:    movl 52(%ecx), %esi
+; FALLBACK26-NEXT:    movl 56(%ecx), %edi
+; FALLBACK26-NEXT:    movl 60(%ecx), %ecx
+; FALLBACK26-NEXT:    movl (%eax), %eax
+; FALLBACK26-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; FALLBACK26-NEXT:    movl %edi, {{[0-9]+}}(%esp)
+; FALLBACK26-NEXT:    movl %esi, {{[0-9]+}}(%esp)
+; FALLBACK26-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; FALLBACK26-NEXT:    vmovaps %xmm1, {{[0-9]+}}(%esp)
+; FALLBACK26-NEXT:    vmovups %ymm0, {{[0-9]+}}(%esp)
+; FALLBACK26-NEXT:    sarl $31, %ecx
+; FALLBACK26-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; FALLBACK26-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; FALLBACK26-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; FALLBACK26-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; FALLBACK26-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; FALLBACK26-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; FALLBACK26-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; FALLBACK26-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; FALLBACK26-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; FALLBACK26-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; FALLBACK26-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; FALLBACK26-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; FALLBACK26-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; FALLBACK26-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; FALLBACK26-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; FALLBACK26-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; FALLBACK26-NEXT:    movl %eax, %ecx
+; FALLBACK26-NEXT:    leal (,%eax,8), %edx
+; FALLBACK26-NEXT:    andl $24, %edx
+; FALLBACK26-NEXT:    andl $60, %ecx
+; FALLBACK26-NEXT:    movl 68(%esp,%ecx), %esi
+; FALLBACK26-NEXT:    movl 72(%esp,%ecx), %edi
+; FALLBACK26-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK26-NEXT:    shrxl %edx, %esi, %eax
+; FALLBACK26-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK26-NEXT:    movl %edx, %ebx
+; FALLBACK26-NEXT:    notb %bl
+; FALLBACK26-NEXT:    leal (%edi,%edi), %ebp
+; FALLBACK26-NEXT:    shlxl %ebx, %ebp, %eax
+; FALLBACK26-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; FALLBACK26-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK26-NEXT:    shrxl %edx, 64(%esp,%ecx), %edi
+; FALLBACK26-NEXT:    addl %esi, %esi
+; FALLBACK26-NEXT:    shlxl %ebx, %esi, %eax
+; FALLBACK26-NEXT:    orl %edi, %eax
+; FALLBACK26-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK26-NEXT:    movl 80(%esp,%ecx), %esi
+; FALLBACK26-NEXT:    leal (%esi,%esi), %edi
+; FALLBACK26-NEXT:    shlxl %ebx, %edi, %eax
+; FALLBACK26-NEXT:    movl 76(%esp,%ecx), %edi
+; FALLBACK26-NEXT:    shrxl %edx, %edi, %ebp
+; FALLBACK26-NEXT:    orl %ebp, %eax
+; FALLBACK26-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK26-NEXT:    shrxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; FALLBACK26-NEXT:    addl %edi, %edi
+; FALLBACK26-NEXT:    shlxl %ebx, %edi, %edi
+; FALLBACK26-NEXT:    orl %eax, %edi
+; FALLBACK26-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK26-NEXT:    movl 88(%esp,%ecx), %eax
+; FALLBACK26-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK26-NEXT:    leal (%eax,%eax), %edi
+; FALLBACK26-NEXT:    shlxl %ebx, %edi, %eax
+; FALLBACK26-NEXT:    movl 84(%esp,%ecx), %edi
+; FALLBACK26-NEXT:    shrxl %edx, %edi, %ebp
+; FALLBACK26-NEXT:    orl %ebp, %eax
+; FALLBACK26-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK26-NEXT:    shrxl %edx, %esi, %esi
+; FALLBACK26-NEXT:    addl %edi, %edi
+; FALLBACK26-NEXT:    shlxl %ebx, %edi, %eax
+; FALLBACK26-NEXT:    orl %esi, %eax
+; FALLBACK26-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK26-NEXT:    movl 96(%esp,%ecx), %esi
+; FALLBACK26-NEXT:    leal (%esi,%esi), %edi
+; FALLBACK26-NEXT:    shlxl %ebx, %edi, %eax
+; FALLBACK26-NEXT:    movl 92(%esp,%ecx), %edi
+; FALLBACK26-NEXT:    shrxl %edx, %edi, %ebp
+; FALLBACK26-NEXT:    orl %ebp, %eax
+; FALLBACK26-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK26-NEXT:    shrxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; FALLBACK26-NEXT:    addl %edi, %edi
+; FALLBACK26-NEXT:    shlxl %ebx, %edi, %edi
+; FALLBACK26-NEXT:    orl %eax, %edi
+; FALLBACK26-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK26-NEXT:    movl 104(%esp,%ecx), %eax
+; FALLBACK26-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK26-NEXT:    leal (%eax,%eax), %edi
+; FALLBACK26-NEXT:    shlxl %ebx, %edi, %eax
+; FALLBACK26-NEXT:    movl 100(%esp,%ecx), %edi
+; FALLBACK26-NEXT:    shrxl %edx, %edi, %ebp
+; FALLBACK26-NEXT:    orl %ebp, %eax
+; FALLBACK26-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK26-NEXT:    shrxl %edx, %esi, %esi
+; FALLBACK26-NEXT:    addl %edi, %edi
+; FALLBACK26-NEXT:    shlxl %ebx, %edi, %eax
+; FALLBACK26-NEXT:    orl %esi, %eax
+; FALLBACK26-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK26-NEXT:    movl 112(%esp,%ecx), %eax
+; FALLBACK26-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK26-NEXT:    leal (%eax,%eax), %esi
+; FALLBACK26-NEXT:    shlxl %ebx, %esi, %eax
+; FALLBACK26-NEXT:    movl 108(%esp,%ecx), %esi
+; FALLBACK26-NEXT:    movl %ecx, %edi
+; FALLBACK26-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK26-NEXT:    shrxl %edx, %esi, %ebp
+; FALLBACK26-NEXT:    orl %ebp, %eax
+; FALLBACK26-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK26-NEXT:    shrxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
+; FALLBACK26-NEXT:    addl %esi, %esi
+; FALLBACK26-NEXT:    shlxl %ebx, %esi, %esi
+; FALLBACK26-NEXT:    orl %ecx, %esi
+; FALLBACK26-NEXT:    movl 120(%esp,%edi), %ebp
+; FALLBACK26-NEXT:    leal (%ebp,%ebp), %ecx
+; FALLBACK26-NEXT:    shlxl %ebx, %ecx, %ecx
+; FALLBACK26-NEXT:    movl 116(%esp,%edi), %eax
+; FALLBACK26-NEXT:    shrxl %edx, %eax, %edi
+; FALLBACK26-NEXT:    orl %edi, %ecx
+; FALLBACK26-NEXT:    shrxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
+; FALLBACK26-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK26-NEXT:    addl %eax, %eax
+; FALLBACK26-NEXT:    shlxl %ebx, %eax, %edi
+; FALLBACK26-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
+; FALLBACK26-NEXT:    shrxl %edx, %ebp, %eax
+; FALLBACK26-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
+; FALLBACK26-NEXT:    movl 124(%esp,%ebp), %ebp
+; FALLBACK26-NEXT:    sarxl %edx, %ebp, %edx
+; FALLBACK26-NEXT:    addl %ebp, %ebp
+; FALLBACK26-NEXT:    shlxl %ebx, %ebp, %ebx
+; FALLBACK26-NEXT:    orl %eax, %ebx
+; FALLBACK26-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; FALLBACK26-NEXT:    movl %edx, 60(%eax)
+; FALLBACK26-NEXT:    movl %ebx, 56(%eax)
+; FALLBACK26-NEXT:    movl %edi, 48(%eax)
+; FALLBACK26-NEXT:    movl %ecx, 52(%eax)
+; FALLBACK26-NEXT:    movl %esi, 40(%eax)
+; FALLBACK26-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK26-NEXT:    movl %ecx, 44(%eax)
+; FALLBACK26-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK26-NEXT:    movl %ecx, 32(%eax)
+; FALLBACK26-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK26-NEXT:    movl %ecx, 36(%eax)
+; FALLBACK26-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK26-NEXT:    movl %ecx, 24(%eax)
+; FALLBACK26-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK26-NEXT:    movl %ecx, 28(%eax)
+; FALLBACK26-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK26-NEXT:    movl %ecx, 16(%eax)
+; FALLBACK26-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK26-NEXT:    movl %ecx, 20(%eax)
+; FALLBACK26-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK26-NEXT:    movl %ecx, 8(%eax)
+; FALLBACK26-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK26-NEXT:    movl %ecx, 12(%eax)
+; FALLBACK26-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK26-NEXT:    movl %ecx, (%eax)
+; FALLBACK26-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK26-NEXT:    movl %ecx, 4(%eax)
+; FALLBACK26-NEXT:    addl $204, %esp
+; FALLBACK26-NEXT:    popl %esi
+; FALLBACK26-NEXT:    popl %edi
+; FALLBACK26-NEXT:    popl %ebx
+; FALLBACK26-NEXT:    popl %ebp
+; FALLBACK26-NEXT:    vzeroupper
+; FALLBACK26-NEXT:    retl
+;
+; FALLBACK27-LABEL: ashr_64bytes:
+; FALLBACK27:       # %bb.0:
+; FALLBACK27-NEXT:    pushl %ebp
+; FALLBACK27-NEXT:    pushl %ebx
+; FALLBACK27-NEXT:    pushl %edi
+; FALLBACK27-NEXT:    pushl %esi
+; FALLBACK27-NEXT:    subl $188, %esp
+; FALLBACK27-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; FALLBACK27-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; FALLBACK27-NEXT:    vmovups (%eax), %ymm0
+; FALLBACK27-NEXT:    vmovups 32(%eax), %xmm1
+; FALLBACK27-NEXT:    movl 48(%eax), %edx
+; FALLBACK27-NEXT:    movl 52(%eax), %esi
+; FALLBACK27-NEXT:    movl 56(%eax), %edi
+; FALLBACK27-NEXT:    movl 60(%eax), %eax
+; FALLBACK27-NEXT:    movl (%ecx), %ecx
+; FALLBACK27-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK27-NEXT:    movl %edi, {{[0-9]+}}(%esp)
+; FALLBACK27-NEXT:    movl %esi, {{[0-9]+}}(%esp)
+; FALLBACK27-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; FALLBACK27-NEXT:    vmovaps %xmm1, {{[0-9]+}}(%esp)
+; FALLBACK27-NEXT:    vmovups %ymm0, {{[0-9]+}}(%esp)
+; FALLBACK27-NEXT:    sarl $31, %eax
+; FALLBACK27-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK27-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK27-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK27-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK27-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK27-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK27-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK27-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK27-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK27-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK27-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK27-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK27-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK27-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK27-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK27-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK27-NEXT:    movl %ecx, %ebp
+; FALLBACK27-NEXT:    andl $60, %ebp
+; FALLBACK27-NEXT:    movl 56(%esp,%ebp), %edx
+; FALLBACK27-NEXT:    movl 52(%esp,%ebp), %eax
+; FALLBACK27-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK27-NEXT:    shll $3, %ecx
+; FALLBACK27-NEXT:    andl $24, %ecx
+; FALLBACK27-NEXT:    shrdl %cl, %edx, %eax
+; FALLBACK27-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK27-NEXT:    movl 64(%esp,%ebp), %edi
+; FALLBACK27-NEXT:    movl 60(%esp,%ebp), %eax
+; FALLBACK27-NEXT:    movl %eax, %esi
+; FALLBACK27-NEXT:    shrdl %cl, %edi, %esi
+; FALLBACK27-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK27-NEXT:    shrdl %cl, %eax, %edx
+; FALLBACK27-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK27-NEXT:    movl 72(%esp,%ebp), %esi
+; FALLBACK27-NEXT:    movl 68(%esp,%ebp), %eax
+; FALLBACK27-NEXT:    movl %eax, %edx
+; FALLBACK27-NEXT:    shrdl %cl, %esi, %edx
+; FALLBACK27-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK27-NEXT:    shrdl %cl, %eax, %edi
+; FALLBACK27-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK27-NEXT:    movl 80(%esp,%ebp), %edi
+; FALLBACK27-NEXT:    movl 76(%esp,%ebp), %eax
+; FALLBACK27-NEXT:    movl %eax, %edx
+; FALLBACK27-NEXT:    shrdl %cl, %edi, %edx
+; FALLBACK27-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK27-NEXT:    shrdl %cl, %eax, %esi
+; FALLBACK27-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK27-NEXT:    movl 88(%esp,%ebp), %ebx
+; FALLBACK27-NEXT:    movl 84(%esp,%ebp), %eax
+; FALLBACK27-NEXT:    movl %eax, %edx
+; FALLBACK27-NEXT:    shrdl %cl, %ebx, %edx
+; FALLBACK27-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK27-NEXT:    shrdl %cl, %eax, %edi
+; FALLBACK27-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK27-NEXT:    movl 96(%esp,%ebp), %esi
+; FALLBACK27-NEXT:    movl 92(%esp,%ebp), %eax
+; FALLBACK27-NEXT:    movl %eax, %edx
+; FALLBACK27-NEXT:    shrdl %cl, %esi, %edx
+; FALLBACK27-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK27-NEXT:    shrdl %cl, %eax, %ebx
+; FALLBACK27-NEXT:    movl 104(%esp,%ebp), %eax
+; FALLBACK27-NEXT:    movl 100(%esp,%ebp), %edi
+; FALLBACK27-NEXT:    movl %edi, %edx
+; FALLBACK27-NEXT:    shrdl %cl, %eax, %edx
+; FALLBACK27-NEXT:    shrdl %cl, %edi, %esi
+; FALLBACK27-NEXT:    movl 48(%esp,%ebp), %edi
+; FALLBACK27-NEXT:    movl 108(%esp,%ebp), %ebp
+; FALLBACK27-NEXT:    movl %ebp, (%esp) # 4-byte Spill
+; FALLBACK27-NEXT:    shrdl %cl, %ebp, %eax
+; FALLBACK27-NEXT:    movl {{[0-9]+}}(%esp), %ebp
+; FALLBACK27-NEXT:    movl %eax, 56(%ebp)
+; FALLBACK27-NEXT:    movl %esi, 48(%ebp)
+; FALLBACK27-NEXT:    movl %edx, 52(%ebp)
+; FALLBACK27-NEXT:    movl %ebx, 40(%ebp)
+; FALLBACK27-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK27-NEXT:    movl %eax, 44(%ebp)
+; FALLBACK27-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK27-NEXT:    movl %eax, 32(%ebp)
+; FALLBACK27-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK27-NEXT:    movl %eax, 36(%ebp)
+; FALLBACK27-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK27-NEXT:    movl %eax, 24(%ebp)
+; FALLBACK27-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK27-NEXT:    movl %eax, 28(%ebp)
+; FALLBACK27-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK27-NEXT:    movl %eax, 16(%ebp)
+; FALLBACK27-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK27-NEXT:    movl %eax, 20(%ebp)
+; FALLBACK27-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK27-NEXT:    movl %eax, 8(%ebp)
+; FALLBACK27-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK27-NEXT:    movl %eax, 12(%ebp)
+; FALLBACK27-NEXT:    sarxl %ecx, (%esp), %eax # 4-byte Folded Reload
+; FALLBACK27-NEXT:    # kill: def $cl killed $cl killed $ecx
+; FALLBACK27-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; FALLBACK27-NEXT:    shrdl %cl, %edx, %edi
+; FALLBACK27-NEXT:    movl %edi, (%ebp)
+; FALLBACK27-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK27-NEXT:    movl %ecx, 4(%ebp)
+; FALLBACK27-NEXT:    movl %eax, 60(%ebp)
+; FALLBACK27-NEXT:    addl $188, %esp
+; FALLBACK27-NEXT:    popl %esi
+; FALLBACK27-NEXT:    popl %edi
+; FALLBACK27-NEXT:    popl %ebx
+; FALLBACK27-NEXT:    popl %ebp
+; FALLBACK27-NEXT:    vzeroupper
+; FALLBACK27-NEXT:    retl
+;
+; FALLBACK28-LABEL: ashr_64bytes:
+; FALLBACK28:       # %bb.0:
+; FALLBACK28-NEXT:    pushl %ebp
+; FALLBACK28-NEXT:    pushl %ebx
+; FALLBACK28-NEXT:    pushl %edi
+; FALLBACK28-NEXT:    pushl %esi
+; FALLBACK28-NEXT:    subl $204, %esp
+; FALLBACK28-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; FALLBACK28-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; FALLBACK28-NEXT:    vmovups (%ecx), %ymm0
+; FALLBACK28-NEXT:    vmovups 32(%ecx), %xmm1
+; FALLBACK28-NEXT:    movl 48(%ecx), %edx
+; FALLBACK28-NEXT:    movl 52(%ecx), %esi
+; FALLBACK28-NEXT:    movl 56(%ecx), %edi
+; FALLBACK28-NEXT:    movl 60(%ecx), %ecx
+; FALLBACK28-NEXT:    movl (%eax), %eax
+; FALLBACK28-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; FALLBACK28-NEXT:    movl %edi, {{[0-9]+}}(%esp)
+; FALLBACK28-NEXT:    movl %esi, {{[0-9]+}}(%esp)
+; FALLBACK28-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; FALLBACK28-NEXT:    vmovaps %xmm1, {{[0-9]+}}(%esp)
+; FALLBACK28-NEXT:    vmovups %ymm0, {{[0-9]+}}(%esp)
+; FALLBACK28-NEXT:    sarl $31, %ecx
+; FALLBACK28-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; FALLBACK28-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; FALLBACK28-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; FALLBACK28-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; FALLBACK28-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; FALLBACK28-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; FALLBACK28-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; FALLBACK28-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; FALLBACK28-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; FALLBACK28-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; FALLBACK28-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; FALLBACK28-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; FALLBACK28-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; FALLBACK28-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; FALLBACK28-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; FALLBACK28-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; FALLBACK28-NEXT:    movl %eax, %esi
+; FALLBACK28-NEXT:    andl $60, %esi
+; FALLBACK28-NEXT:    movl 68(%esp,%esi), %edx
+; FALLBACK28-NEXT:    shll $3, %eax
+; FALLBACK28-NEXT:    andl $24, %eax
+; FALLBACK28-NEXT:    movl %edx, %edi
+; FALLBACK28-NEXT:    movl %eax, %ecx
+; FALLBACK28-NEXT:    shrl %cl, %edi
+; FALLBACK28-NEXT:    movl 72(%esp,%esi), %ecx
+; FALLBACK28-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK28-NEXT:    leal (%ecx,%ecx), %ebx
+; FALLBACK28-NEXT:    movb %al, %ch
+; FALLBACK28-NEXT:    notb %ch
+; FALLBACK28-NEXT:    movb %ch, %cl
+; FALLBACK28-NEXT:    shll %cl, %ebx
+; FALLBACK28-NEXT:    orl %edi, %ebx
+; FALLBACK28-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK28-NEXT:    movl 64(%esp,%esi), %edi
+; FALLBACK28-NEXT:    movb %al, %cl
+; FALLBACK28-NEXT:    shrl %cl, %edi
+; FALLBACK28-NEXT:    addl %edx, %edx
+; FALLBACK28-NEXT:    movb %ch, %cl
+; FALLBACK28-NEXT:    shll %cl, %edx
+; FALLBACK28-NEXT:    orl %edi, %edx
+; FALLBACK28-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK28-NEXT:    movl 76(%esp,%esi), %edx
+; FALLBACK28-NEXT:    movl %edx, %ebp
+; FALLBACK28-NEXT:    movb %al, %cl
+; FALLBACK28-NEXT:    shrl %cl, %ebp
+; FALLBACK28-NEXT:    movl 80(%esp,%esi), %edi
+; FALLBACK28-NEXT:    leal (%edi,%edi), %ebx
+; FALLBACK28-NEXT:    movb %ch, %cl
+; FALLBACK28-NEXT:    shll %cl, %ebx
+; FALLBACK28-NEXT:    orl %ebp, %ebx
+; FALLBACK28-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK28-NEXT:    movb %al, %cl
+; FALLBACK28-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; FALLBACK28-NEXT:    shrl %cl, %ebx
+; FALLBACK28-NEXT:    addl %edx, %edx
+; FALLBACK28-NEXT:    movb %ch, %cl
+; FALLBACK28-NEXT:    shll %cl, %edx
+; FALLBACK28-NEXT:    orl %ebx, %edx
+; FALLBACK28-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK28-NEXT:    movl 84(%esp,%esi), %ebx
+; FALLBACK28-NEXT:    movl %ebx, %ebp
+; FALLBACK28-NEXT:    movl %eax, %edx
+; FALLBACK28-NEXT:    movb %dl, %cl
+; FALLBACK28-NEXT:    shrl %cl, %ebp
+; FALLBACK28-NEXT:    movl 88(%esp,%esi), %eax
+; FALLBACK28-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK28-NEXT:    addl %eax, %eax
+; FALLBACK28-NEXT:    movb %ch, %cl
+; FALLBACK28-NEXT:    shll %cl, %eax
+; FALLBACK28-NEXT:    orl %ebp, %eax
+; FALLBACK28-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK28-NEXT:    movb %dl, %cl
+; FALLBACK28-NEXT:    shrl %cl, %edi
+; FALLBACK28-NEXT:    addl %ebx, %ebx
+; FALLBACK28-NEXT:    movb %ch, %cl
+; FALLBACK28-NEXT:    movb %ch, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill
+; FALLBACK28-NEXT:    shll %cl, %ebx
+; FALLBACK28-NEXT:    orl %edi, %ebx
+; FALLBACK28-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK28-NEXT:    movl 92(%esp,%esi), %ebx
+; FALLBACK28-NEXT:    movl %ebx, %ebp
+; FALLBACK28-NEXT:    movb %dl, %cl
+; FALLBACK28-NEXT:    shrl %cl, %ebp
+; FALLBACK28-NEXT:    movl 96(%esp,%esi), %edi
+; FALLBACK28-NEXT:    leal (%edi,%edi), %eax
+; FALLBACK28-NEXT:    movb %ch, %cl
+; FALLBACK28-NEXT:    shll %cl, %eax
+; FALLBACK28-NEXT:    orl %ebp, %eax
+; FALLBACK28-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK28-NEXT:    movb %dl, %cl
+; FALLBACK28-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK28-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK28-NEXT:    shrl %cl, %eax
+; FALLBACK28-NEXT:    addl %ebx, %ebx
+; FALLBACK28-NEXT:    movb %ch, %cl
+; FALLBACK28-NEXT:    shll %cl, %ebx
+; FALLBACK28-NEXT:    orl %eax, %ebx
+; FALLBACK28-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK28-NEXT:    movl 100(%esp,%esi), %ebx
+; FALLBACK28-NEXT:    movl %ebx, %ebp
+; FALLBACK28-NEXT:    movb %dl, %cl
+; FALLBACK28-NEXT:    shrl %cl, %ebp
+; FALLBACK28-NEXT:    movl 104(%esp,%esi), %edx
+; FALLBACK28-NEXT:    leal (%edx,%edx), %eax
+; FALLBACK28-NEXT:    movb %ch, %cl
+; FALLBACK28-NEXT:    shll %cl, %eax
+; FALLBACK28-NEXT:    orl %ebp, %eax
+; FALLBACK28-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK28-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK28-NEXT:    movb %al, %cl
+; FALLBACK28-NEXT:    shrl %cl, %edi
+; FALLBACK28-NEXT:    addl %ebx, %ebx
+; FALLBACK28-NEXT:    movb %ch, %cl
+; FALLBACK28-NEXT:    shll %cl, %ebx
+; FALLBACK28-NEXT:    orl %edi, %ebx
+; FALLBACK28-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK28-NEXT:    movl 108(%esp,%esi), %edi
+; FALLBACK28-NEXT:    movl %edi, %ebp
+; FALLBACK28-NEXT:    movl %eax, %ecx
+; FALLBACK28-NEXT:    shrl %cl, %ebp
+; FALLBACK28-NEXT:    movl 112(%esp,%esi), %ecx
+; FALLBACK28-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK28-NEXT:    leal (%ecx,%ecx), %ebx
+; FALLBACK28-NEXT:    movb {{[-0-9]+}}(%e{{[sb]}}p), %ch # 1-byte Reload
+; FALLBACK28-NEXT:    movb %ch, %cl
+; FALLBACK28-NEXT:    shll %cl, %ebx
+; FALLBACK28-NEXT:    orl %ebp, %ebx
+; FALLBACK28-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK28-NEXT:    movb %al, %cl
+; FALLBACK28-NEXT:    shrl %cl, %edx
+; FALLBACK28-NEXT:    addl %edi, %edi
+; FALLBACK28-NEXT:    movb %ch, %cl
+; FALLBACK28-NEXT:    shll %cl, %edi
+; FALLBACK28-NEXT:    orl %edx, %edi
+; FALLBACK28-NEXT:    movl %esi, %edx
+; FALLBACK28-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK28-NEXT:    movl 116(%esp,%esi), %esi
+; FALLBACK28-NEXT:    movl %esi, %ebx
+; FALLBACK28-NEXT:    movb %al, %cl
+; FALLBACK28-NEXT:    shrl %cl, %ebx
+; FALLBACK28-NEXT:    movl 120(%esp,%edx), %eax
+; FALLBACK28-NEXT:    leal (%eax,%eax), %ebp
+; FALLBACK28-NEXT:    movb %ch, %cl
+; FALLBACK28-NEXT:    shll %cl, %ebp
+; FALLBACK28-NEXT:    orl %ebx, %ebp
+; FALLBACK28-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; FALLBACK28-NEXT:    movb %dl, %cl
+; FALLBACK28-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; FALLBACK28-NEXT:    shrl %cl, %ebx
+; FALLBACK28-NEXT:    addl %esi, %esi
+; FALLBACK28-NEXT:    movb %ch, %cl
+; FALLBACK28-NEXT:    shll %cl, %esi
+; FALLBACK28-NEXT:    orl %ebx, %esi
+; FALLBACK28-NEXT:    movb %dl, %cl
+; FALLBACK28-NEXT:    shrl %cl, %eax
+; FALLBACK28-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; FALLBACK28-NEXT:    movl 124(%esp,%edx), %ebx
+; FALLBACK28-NEXT:    leal (%ebx,%ebx), %edx
+; FALLBACK28-NEXT:    movb %ch, %cl
+; FALLBACK28-NEXT:    shll %cl, %edx
+; FALLBACK28-NEXT:    orl %eax, %edx
+; FALLBACK28-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK28-NEXT:    # kill: def $cl killed $cl killed $ecx
+; FALLBACK28-NEXT:    sarl %cl, %ebx
+; FALLBACK28-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; FALLBACK28-NEXT:    movl %ebx, 60(%eax)
+; FALLBACK28-NEXT:    movl %edx, 56(%eax)
+; FALLBACK28-NEXT:    movl %esi, 48(%eax)
+; FALLBACK28-NEXT:    movl %ebp, 52(%eax)
+; FALLBACK28-NEXT:    movl %edi, 40(%eax)
+; FALLBACK28-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK28-NEXT:    movl %ecx, 44(%eax)
+; FALLBACK28-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK28-NEXT:    movl %ecx, 32(%eax)
+; FALLBACK28-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK28-NEXT:    movl %ecx, 36(%eax)
+; FALLBACK28-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK28-NEXT:    movl %ecx, 24(%eax)
+; FALLBACK28-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK28-NEXT:    movl %ecx, 28(%eax)
+; FALLBACK28-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK28-NEXT:    movl %ecx, 16(%eax)
+; FALLBACK28-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK28-NEXT:    movl %ecx, 20(%eax)
+; FALLBACK28-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK28-NEXT:    movl %ecx, 8(%eax)
+; FALLBACK28-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK28-NEXT:    movl %ecx, 12(%eax)
+; FALLBACK28-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK28-NEXT:    movl %ecx, (%eax)
+; FALLBACK28-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK28-NEXT:    movl %ecx, 4(%eax)
+; FALLBACK28-NEXT:    addl $204, %esp
+; FALLBACK28-NEXT:    popl %esi
+; FALLBACK28-NEXT:    popl %edi
+; FALLBACK28-NEXT:    popl %ebx
+; FALLBACK28-NEXT:    popl %ebp
+; FALLBACK28-NEXT:    vzeroupper
+; FALLBACK28-NEXT:    retl
+;
+; FALLBACK29-LABEL: ashr_64bytes:
+; FALLBACK29:       # %bb.0:
+; FALLBACK29-NEXT:    pushl %ebp
+; FALLBACK29-NEXT:    pushl %ebx
+; FALLBACK29-NEXT:    pushl %edi
+; FALLBACK29-NEXT:    pushl %esi
+; FALLBACK29-NEXT:    subl $188, %esp
+; FALLBACK29-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; FALLBACK29-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; FALLBACK29-NEXT:    vmovups (%eax), %ymm0
+; FALLBACK29-NEXT:    vmovups 32(%eax), %xmm1
+; FALLBACK29-NEXT:    movl 48(%eax), %edx
+; FALLBACK29-NEXT:    movl 52(%eax), %esi
+; FALLBACK29-NEXT:    movl 56(%eax), %edi
+; FALLBACK29-NEXT:    movl 60(%eax), %eax
+; FALLBACK29-NEXT:    movl (%ecx), %ecx
+; FALLBACK29-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK29-NEXT:    movl %edi, {{[0-9]+}}(%esp)
+; FALLBACK29-NEXT:    movl %esi, {{[0-9]+}}(%esp)
+; FALLBACK29-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; FALLBACK29-NEXT:    vmovaps %xmm1, {{[0-9]+}}(%esp)
+; FALLBACK29-NEXT:    vmovups %ymm0, {{[0-9]+}}(%esp)
+; FALLBACK29-NEXT:    sarl $31, %eax
+; FALLBACK29-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK29-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK29-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK29-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK29-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK29-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK29-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK29-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK29-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK29-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK29-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK29-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK29-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK29-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK29-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK29-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK29-NEXT:    movl %ecx, %ebp
+; FALLBACK29-NEXT:    andl $60, %ebp
+; FALLBACK29-NEXT:    movl 56(%esp,%ebp), %edx
+; FALLBACK29-NEXT:    movl 52(%esp,%ebp), %eax
+; FALLBACK29-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK29-NEXT:    shll $3, %ecx
+; FALLBACK29-NEXT:    andl $24, %ecx
+; FALLBACK29-NEXT:    shrdl %cl, %edx, %eax
+; FALLBACK29-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK29-NEXT:    movl 64(%esp,%ebp), %edi
+; FALLBACK29-NEXT:    movl 60(%esp,%ebp), %eax
+; FALLBACK29-NEXT:    movl %eax, %esi
+; FALLBACK29-NEXT:    shrdl %cl, %edi, %esi
+; FALLBACK29-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK29-NEXT:    shrdl %cl, %eax, %edx
+; FALLBACK29-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK29-NEXT:    movl 72(%esp,%ebp), %esi
+; FALLBACK29-NEXT:    movl 68(%esp,%ebp), %eax
+; FALLBACK29-NEXT:    movl %eax, %edx
+; FALLBACK29-NEXT:    shrdl %cl, %esi, %edx
+; FALLBACK29-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK29-NEXT:    shrdl %cl, %eax, %edi
+; FALLBACK29-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK29-NEXT:    movl 80(%esp,%ebp), %edi
+; FALLBACK29-NEXT:    movl 76(%esp,%ebp), %eax
+; FALLBACK29-NEXT:    movl %eax, %edx
+; FALLBACK29-NEXT:    shrdl %cl, %edi, %edx
+; FALLBACK29-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK29-NEXT:    shrdl %cl, %eax, %esi
+; FALLBACK29-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK29-NEXT:    movl 88(%esp,%ebp), %esi
+; FALLBACK29-NEXT:    movl 84(%esp,%ebp), %eax
+; FALLBACK29-NEXT:    movl %eax, %edx
+; FALLBACK29-NEXT:    shrdl %cl, %esi, %edx
+; FALLBACK29-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK29-NEXT:    movl %esi, %edx
+; FALLBACK29-NEXT:    shrdl %cl, %eax, %edi
+; FALLBACK29-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK29-NEXT:    movl 96(%esp,%ebp), %esi
+; FALLBACK29-NEXT:    movl 92(%esp,%ebp), %eax
+; FALLBACK29-NEXT:    movl %eax, %edi
+; FALLBACK29-NEXT:    shrdl %cl, %esi, %edi
+; FALLBACK29-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK29-NEXT:    shrdl %cl, %eax, %edx
+; FALLBACK29-NEXT:    movl %edx, (%esp) # 4-byte Spill
+; FALLBACK29-NEXT:    movl 104(%esp,%ebp), %edx
+; FALLBACK29-NEXT:    movl 100(%esp,%ebp), %eax
+; FALLBACK29-NEXT:    movl %eax, %edi
+; FALLBACK29-NEXT:    shrdl %cl, %edx, %edi
+; FALLBACK29-NEXT:    shrdl %cl, %eax, %esi
+; FALLBACK29-NEXT:    movl 48(%esp,%ebp), %ebx
+; FALLBACK29-NEXT:    movl 108(%esp,%ebp), %eax
+; FALLBACK29-NEXT:    shrdl %cl, %eax, %edx
+; FALLBACK29-NEXT:    movl {{[0-9]+}}(%esp), %ebp
+; FALLBACK29-NEXT:    movl %edx, 56(%ebp)
+; FALLBACK29-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; FALLBACK29-NEXT:    shrdl %cl, %edx, %ebx
+; FALLBACK29-NEXT:    # kill: def $cl killed $cl killed $ecx
+; FALLBACK29-NEXT:    sarl %cl, %eax
+; FALLBACK29-NEXT:    movl %eax, 60(%ebp)
+; FALLBACK29-NEXT:    movl %esi, 48(%ebp)
+; FALLBACK29-NEXT:    movl %edi, 52(%ebp)
+; FALLBACK29-NEXT:    movl (%esp), %eax # 4-byte Reload
+; FALLBACK29-NEXT:    movl %eax, 40(%ebp)
+; FALLBACK29-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK29-NEXT:    movl %eax, 44(%ebp)
+; FALLBACK29-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK29-NEXT:    movl %eax, 32(%ebp)
+; FALLBACK29-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK29-NEXT:    movl %eax, 36(%ebp)
+; FALLBACK29-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK29-NEXT:    movl %eax, 24(%ebp)
+; FALLBACK29-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK29-NEXT:    movl %eax, 28(%ebp)
+; FALLBACK29-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK29-NEXT:    movl %eax, 16(%ebp)
+; FALLBACK29-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK29-NEXT:    movl %eax, 20(%ebp)
+; FALLBACK29-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK29-NEXT:    movl %eax, 8(%ebp)
+; FALLBACK29-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK29-NEXT:    movl %eax, 12(%ebp)
+; FALLBACK29-NEXT:    movl %ebx, (%ebp)
+; FALLBACK29-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK29-NEXT:    movl %eax, 4(%ebp)
+; FALLBACK29-NEXT:    addl $188, %esp
+; FALLBACK29-NEXT:    popl %esi
+; FALLBACK29-NEXT:    popl %edi
+; FALLBACK29-NEXT:    popl %ebx
+; FALLBACK29-NEXT:    popl %ebp
+; FALLBACK29-NEXT:    vzeroupper
+; FALLBACK29-NEXT:    retl
+;
+; FALLBACK30-LABEL: ashr_64bytes:
+; FALLBACK30:       # %bb.0:
+; FALLBACK30-NEXT:    pushl %ebp
+; FALLBACK30-NEXT:    pushl %ebx
+; FALLBACK30-NEXT:    pushl %edi
+; FALLBACK30-NEXT:    pushl %esi
+; FALLBACK30-NEXT:    subl $204, %esp
+; FALLBACK30-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; FALLBACK30-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; FALLBACK30-NEXT:    vmovups (%ecx), %ymm0
+; FALLBACK30-NEXT:    vmovups 32(%ecx), %xmm1
+; FALLBACK30-NEXT:    movl 48(%ecx), %edx
+; FALLBACK30-NEXT:    movl 52(%ecx), %esi
+; FALLBACK30-NEXT:    movl 56(%ecx), %edi
+; FALLBACK30-NEXT:    movl 60(%ecx), %ecx
+; FALLBACK30-NEXT:    movl (%eax), %eax
+; FALLBACK30-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; FALLBACK30-NEXT:    movl %edi, {{[0-9]+}}(%esp)
+; FALLBACK30-NEXT:    movl %esi, {{[0-9]+}}(%esp)
+; FALLBACK30-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; FALLBACK30-NEXT:    vmovaps %xmm1, {{[0-9]+}}(%esp)
+; FALLBACK30-NEXT:    vmovups %ymm0, {{[0-9]+}}(%esp)
+; FALLBACK30-NEXT:    sarl $31, %ecx
+; FALLBACK30-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; FALLBACK30-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; FALLBACK30-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; FALLBACK30-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; FALLBACK30-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; FALLBACK30-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; FALLBACK30-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; FALLBACK30-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; FALLBACK30-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; FALLBACK30-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; FALLBACK30-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; FALLBACK30-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; FALLBACK30-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; FALLBACK30-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; FALLBACK30-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; FALLBACK30-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; FALLBACK30-NEXT:    movl %eax, %ecx
+; FALLBACK30-NEXT:    leal (,%eax,8), %edx
+; FALLBACK30-NEXT:    andl $24, %edx
+; FALLBACK30-NEXT:    andl $60, %ecx
+; FALLBACK30-NEXT:    movl 68(%esp,%ecx), %esi
+; FALLBACK30-NEXT:    movl 72(%esp,%ecx), %edi
+; FALLBACK30-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK30-NEXT:    shrxl %edx, %esi, %eax
+; FALLBACK30-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK30-NEXT:    movl %edx, %ebx
+; FALLBACK30-NEXT:    notb %bl
+; FALLBACK30-NEXT:    leal (%edi,%edi), %ebp
+; FALLBACK30-NEXT:    shlxl %ebx, %ebp, %eax
+; FALLBACK30-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; FALLBACK30-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK30-NEXT:    shrxl %edx, 64(%esp,%ecx), %edi
+; FALLBACK30-NEXT:    addl %esi, %esi
+; FALLBACK30-NEXT:    shlxl %ebx, %esi, %eax
+; FALLBACK30-NEXT:    orl %edi, %eax
+; FALLBACK30-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK30-NEXT:    movl 80(%esp,%ecx), %esi
+; FALLBACK30-NEXT:    leal (%esi,%esi), %edi
+; FALLBACK30-NEXT:    shlxl %ebx, %edi, %eax
+; FALLBACK30-NEXT:    movl 76(%esp,%ecx), %edi
+; FALLBACK30-NEXT:    shrxl %edx, %edi, %ebp
+; FALLBACK30-NEXT:    orl %ebp, %eax
+; FALLBACK30-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK30-NEXT:    shrxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; FALLBACK30-NEXT:    addl %edi, %edi
+; FALLBACK30-NEXT:    shlxl %ebx, %edi, %edi
+; FALLBACK30-NEXT:    orl %eax, %edi
+; FALLBACK30-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK30-NEXT:    movl 88(%esp,%ecx), %eax
+; FALLBACK30-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK30-NEXT:    leal (%eax,%eax), %edi
+; FALLBACK30-NEXT:    shlxl %ebx, %edi, %eax
+; FALLBACK30-NEXT:    movl 84(%esp,%ecx), %edi
+; FALLBACK30-NEXT:    shrxl %edx, %edi, %ebp
+; FALLBACK30-NEXT:    orl %ebp, %eax
+; FALLBACK30-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK30-NEXT:    shrxl %edx, %esi, %esi
+; FALLBACK30-NEXT:    addl %edi, %edi
+; FALLBACK30-NEXT:    shlxl %ebx, %edi, %eax
+; FALLBACK30-NEXT:    orl %esi, %eax
+; FALLBACK30-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK30-NEXT:    movl 96(%esp,%ecx), %esi
+; FALLBACK30-NEXT:    leal (%esi,%esi), %edi
+; FALLBACK30-NEXT:    shlxl %ebx, %edi, %eax
+; FALLBACK30-NEXT:    movl 92(%esp,%ecx), %edi
+; FALLBACK30-NEXT:    shrxl %edx, %edi, %ebp
+; FALLBACK30-NEXT:    orl %ebp, %eax
+; FALLBACK30-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK30-NEXT:    shrxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; FALLBACK30-NEXT:    addl %edi, %edi
+; FALLBACK30-NEXT:    shlxl %ebx, %edi, %edi
+; FALLBACK30-NEXT:    orl %eax, %edi
+; FALLBACK30-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK30-NEXT:    movl 104(%esp,%ecx), %eax
+; FALLBACK30-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK30-NEXT:    leal (%eax,%eax), %edi
+; FALLBACK30-NEXT:    shlxl %ebx, %edi, %eax
+; FALLBACK30-NEXT:    movl 100(%esp,%ecx), %edi
+; FALLBACK30-NEXT:    shrxl %edx, %edi, %ebp
+; FALLBACK30-NEXT:    orl %ebp, %eax
+; FALLBACK30-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK30-NEXT:    shrxl %edx, %esi, %esi
+; FALLBACK30-NEXT:    addl %edi, %edi
+; FALLBACK30-NEXT:    shlxl %ebx, %edi, %eax
+; FALLBACK30-NEXT:    orl %esi, %eax
+; FALLBACK30-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK30-NEXT:    movl 112(%esp,%ecx), %eax
+; FALLBACK30-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK30-NEXT:    leal (%eax,%eax), %esi
+; FALLBACK30-NEXT:    shlxl %ebx, %esi, %eax
+; FALLBACK30-NEXT:    movl 108(%esp,%ecx), %esi
+; FALLBACK30-NEXT:    movl %ecx, %edi
+; FALLBACK30-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK30-NEXT:    shrxl %edx, %esi, %ebp
+; FALLBACK30-NEXT:    orl %ebp, %eax
+; FALLBACK30-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK30-NEXT:    shrxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
+; FALLBACK30-NEXT:    addl %esi, %esi
+; FALLBACK30-NEXT:    shlxl %ebx, %esi, %esi
+; FALLBACK30-NEXT:    orl %ecx, %esi
+; FALLBACK30-NEXT:    movl 120(%esp,%edi), %ebp
+; FALLBACK30-NEXT:    leal (%ebp,%ebp), %ecx
+; FALLBACK30-NEXT:    shlxl %ebx, %ecx, %ecx
+; FALLBACK30-NEXT:    movl 116(%esp,%edi), %eax
+; FALLBACK30-NEXT:    shrxl %edx, %eax, %edi
+; FALLBACK30-NEXT:    orl %edi, %ecx
+; FALLBACK30-NEXT:    shrxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
+; FALLBACK30-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK30-NEXT:    addl %eax, %eax
+; FALLBACK30-NEXT:    shlxl %ebx, %eax, %edi
+; FALLBACK30-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
+; FALLBACK30-NEXT:    shrxl %edx, %ebp, %eax
+; FALLBACK30-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
+; FALLBACK30-NEXT:    movl 124(%esp,%ebp), %ebp
+; FALLBACK30-NEXT:    sarxl %edx, %ebp, %edx
+; FALLBACK30-NEXT:    addl %ebp, %ebp
+; FALLBACK30-NEXT:    shlxl %ebx, %ebp, %ebx
+; FALLBACK30-NEXT:    orl %eax, %ebx
+; FALLBACK30-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; FALLBACK30-NEXT:    movl %edx, 60(%eax)
+; FALLBACK30-NEXT:    movl %ebx, 56(%eax)
+; FALLBACK30-NEXT:    movl %edi, 48(%eax)
+; FALLBACK30-NEXT:    movl %ecx, 52(%eax)
+; FALLBACK30-NEXT:    movl %esi, 40(%eax)
+; FALLBACK30-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK30-NEXT:    movl %ecx, 44(%eax)
+; FALLBACK30-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK30-NEXT:    movl %ecx, 32(%eax)
+; FALLBACK30-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK30-NEXT:    movl %ecx, 36(%eax)
+; FALLBACK30-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK30-NEXT:    movl %ecx, 24(%eax)
+; FALLBACK30-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK30-NEXT:    movl %ecx, 28(%eax)
+; FALLBACK30-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK30-NEXT:    movl %ecx, 16(%eax)
+; FALLBACK30-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK30-NEXT:    movl %ecx, 20(%eax)
+; FALLBACK30-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK30-NEXT:    movl %ecx, 8(%eax)
+; FALLBACK30-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK30-NEXT:    movl %ecx, 12(%eax)
+; FALLBACK30-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK30-NEXT:    movl %ecx, (%eax)
+; FALLBACK30-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK30-NEXT:    movl %ecx, 4(%eax)
+; FALLBACK30-NEXT:    addl $204, %esp
+; FALLBACK30-NEXT:    popl %esi
+; FALLBACK30-NEXT:    popl %edi
+; FALLBACK30-NEXT:    popl %ebx
+; FALLBACK30-NEXT:    popl %ebp
+; FALLBACK30-NEXT:    vzeroupper
+; FALLBACK30-NEXT:    retl
+;
+; FALLBACK31-LABEL: ashr_64bytes:
+; FALLBACK31:       # %bb.0:
+; FALLBACK31-NEXT:    pushl %ebp
+; FALLBACK31-NEXT:    pushl %ebx
+; FALLBACK31-NEXT:    pushl %edi
+; FALLBACK31-NEXT:    pushl %esi
+; FALLBACK31-NEXT:    subl $188, %esp
+; FALLBACK31-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; FALLBACK31-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; FALLBACK31-NEXT:    vmovups (%eax), %ymm0
+; FALLBACK31-NEXT:    vmovups 32(%eax), %xmm1
+; FALLBACK31-NEXT:    movl 48(%eax), %edx
+; FALLBACK31-NEXT:    movl 52(%eax), %esi
+; FALLBACK31-NEXT:    movl 56(%eax), %edi
+; FALLBACK31-NEXT:    movl 60(%eax), %eax
+; FALLBACK31-NEXT:    movl (%ecx), %ecx
+; FALLBACK31-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK31-NEXT:    movl %edi, {{[0-9]+}}(%esp)
+; FALLBACK31-NEXT:    movl %esi, {{[0-9]+}}(%esp)
+; FALLBACK31-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; FALLBACK31-NEXT:    vmovaps %xmm1, {{[0-9]+}}(%esp)
+; FALLBACK31-NEXT:    vmovups %ymm0, {{[0-9]+}}(%esp)
+; FALLBACK31-NEXT:    sarl $31, %eax
+; FALLBACK31-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK31-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK31-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK31-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK31-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK31-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK31-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK31-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK31-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK31-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK31-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK31-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK31-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK31-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK31-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK31-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK31-NEXT:    movl %ecx, %ebp
+; FALLBACK31-NEXT:    andl $60, %ebp
+; FALLBACK31-NEXT:    movl 56(%esp,%ebp), %edx
+; FALLBACK31-NEXT:    movl 52(%esp,%ebp), %eax
+; FALLBACK31-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK31-NEXT:    shll $3, %ecx
+; FALLBACK31-NEXT:    andl $24, %ecx
+; FALLBACK31-NEXT:    shrdl %cl, %edx, %eax
+; FALLBACK31-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK31-NEXT:    movl 64(%esp,%ebp), %edi
+; FALLBACK31-NEXT:    movl 60(%esp,%ebp), %eax
+; FALLBACK31-NEXT:    movl %eax, %esi
+; FALLBACK31-NEXT:    shrdl %cl, %edi, %esi
+; FALLBACK31-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK31-NEXT:    shrdl %cl, %eax, %edx
+; FALLBACK31-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK31-NEXT:    movl 72(%esp,%ebp), %esi
+; FALLBACK31-NEXT:    movl 68(%esp,%ebp), %eax
+; FALLBACK31-NEXT:    movl %eax, %edx
+; FALLBACK31-NEXT:    shrdl %cl, %esi, %edx
+; FALLBACK31-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK31-NEXT:    shrdl %cl, %eax, %edi
+; FALLBACK31-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK31-NEXT:    movl 80(%esp,%ebp), %edi
+; FALLBACK31-NEXT:    movl 76(%esp,%ebp), %eax
+; FALLBACK31-NEXT:    movl %eax, %edx
+; FALLBACK31-NEXT:    shrdl %cl, %edi, %edx
+; FALLBACK31-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK31-NEXT:    shrdl %cl, %eax, %esi
+; FALLBACK31-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK31-NEXT:    movl 88(%esp,%ebp), %ebx
+; FALLBACK31-NEXT:    movl 84(%esp,%ebp), %eax
+; FALLBACK31-NEXT:    movl %eax, %edx
+; FALLBACK31-NEXT:    shrdl %cl, %ebx, %edx
+; FALLBACK31-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK31-NEXT:    shrdl %cl, %eax, %edi
+; FALLBACK31-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK31-NEXT:    movl 96(%esp,%ebp), %esi
+; FALLBACK31-NEXT:    movl 92(%esp,%ebp), %eax
+; FALLBACK31-NEXT:    movl %eax, %edx
+; FALLBACK31-NEXT:    shrdl %cl, %esi, %edx
+; FALLBACK31-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK31-NEXT:    shrdl %cl, %eax, %ebx
+; FALLBACK31-NEXT:    movl 104(%esp,%ebp), %eax
+; FALLBACK31-NEXT:    movl 100(%esp,%ebp), %edi
+; FALLBACK31-NEXT:    movl %edi, %edx
+; FALLBACK31-NEXT:    shrdl %cl, %eax, %edx
+; FALLBACK31-NEXT:    shrdl %cl, %edi, %esi
+; FALLBACK31-NEXT:    movl 48(%esp,%ebp), %edi
+; FALLBACK31-NEXT:    movl 108(%esp,%ebp), %ebp
+; FALLBACK31-NEXT:    movl %ebp, (%esp) # 4-byte Spill
+; FALLBACK31-NEXT:    shrdl %cl, %ebp, %eax
+; FALLBACK31-NEXT:    movl {{[0-9]+}}(%esp), %ebp
+; FALLBACK31-NEXT:    movl %eax, 56(%ebp)
+; FALLBACK31-NEXT:    movl %esi, 48(%ebp)
+; FALLBACK31-NEXT:    movl %edx, 52(%ebp)
+; FALLBACK31-NEXT:    movl %ebx, 40(%ebp)
+; FALLBACK31-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK31-NEXT:    movl %eax, 44(%ebp)
+; FALLBACK31-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK31-NEXT:    movl %eax, 32(%ebp)
+; FALLBACK31-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK31-NEXT:    movl %eax, 36(%ebp)
+; FALLBACK31-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK31-NEXT:    movl %eax, 24(%ebp)
+; FALLBACK31-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK31-NEXT:    movl %eax, 28(%ebp)
+; FALLBACK31-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK31-NEXT:    movl %eax, 16(%ebp)
+; FALLBACK31-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK31-NEXT:    movl %eax, 20(%ebp)
+; FALLBACK31-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK31-NEXT:    movl %eax, 8(%ebp)
+; FALLBACK31-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK31-NEXT:    movl %eax, 12(%ebp)
+; FALLBACK31-NEXT:    sarxl %ecx, (%esp), %eax # 4-byte Folded Reload
+; FALLBACK31-NEXT:    # kill: def $cl killed $cl killed $ecx
+; FALLBACK31-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; FALLBACK31-NEXT:    shrdl %cl, %edx, %edi
+; FALLBACK31-NEXT:    movl %edi, (%ebp)
+; FALLBACK31-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK31-NEXT:    movl %ecx, 4(%ebp)
+; FALLBACK31-NEXT:    movl %eax, 60(%ebp)
+; FALLBACK31-NEXT:    addl $188, %esp
+; FALLBACK31-NEXT:    popl %esi
+; FALLBACK31-NEXT:    popl %edi
+; FALLBACK31-NEXT:    popl %ebx
+; FALLBACK31-NEXT:    popl %ebp
+; FALLBACK31-NEXT:    vzeroupper
+; FALLBACK31-NEXT:    retl
+  %src = load i512, ptr %src.ptr, align 1
+  %byteOff = load i512, ptr %byteOff.ptr, align 1
+  %bitOff = shl i512 %byteOff, 3
+  %res = ashr i512 %src, %bitOff
+  store i512 %res, ptr %dst, align 1
+  ret void
+}
+
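The FALLBACK29/FALLBACK31 bodies above assemble the 64-byte arithmetic
shift from 32-bit pieces: "andl $60" selects a dword-aligned word in the
spill slot, "shll $3" plus "andl $24" leaves the residual bit count in
%cl, and adjacent dwords are stitched together with shrdl, with sarl or
sarxl producing the sign-filled top word. As a reading aid, here is a
hedged C model of a single "shrdl %cl, hi, lo" step; the helper name is
mine, not anything from the test:

#include <stdint.h>

/* Models x86 "shrdl %cl, hi, lo": shift lo right by cl (0..31),
   filling the vacated high bits from hi. */
static uint32_t shrd32(uint32_t lo, uint32_t hi, unsigned cl) {
    cl &= 31;          /* the hardware masks the count the same way */
    if (cl == 0)
        return lo;     /* avoids the undefined "hi << 32" below */
    return (lo >> cl) | (hi << (32 - cl));
}

Each 32-bit result word in those checks is shrd32(word[i], word[i+1], cl)
taken over the sign-extended spill slot.
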
+define void @ashr_64bytes_qwordOff(ptr %src.ptr, ptr %qwordOff.ptr, ptr %dst) nounwind {
+; X64-SSE2-LABEL: ashr_64bytes_qwordOff:
 ; X64-SSE2:       # %bb.0:
 ; X64-SSE2-NEXT:    pushq %rbx
 ; X64-SSE2-NEXT:    movq (%rdi), %rax
@@ -2394,15 +24296,15 @@ define void @ashr_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ; X64-SSE2-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
 ; X64-SSE2-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
 ; X64-SSE2-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
-; X64-SSE2-NEXT:    andl $63, %esi
-; X64-SSE2-NEXT:    movq -128(%rsp,%rsi), %rax
-; X64-SSE2-NEXT:    movq -120(%rsp,%rsi), %rcx
-; X64-SSE2-NEXT:    movq -104(%rsp,%rsi), %rdi
-; X64-SSE2-NEXT:    movq -112(%rsp,%rsi), %r8
-; X64-SSE2-NEXT:    movq -88(%rsp,%rsi), %r9
-; X64-SSE2-NEXT:    movq -96(%rsp,%rsi), %r10
-; X64-SSE2-NEXT:    movq -72(%rsp,%rsi), %r11
-; X64-SSE2-NEXT:    movq -80(%rsp,%rsi), %rsi
+; X64-SSE2-NEXT:    andl $7, %esi
+; X64-SSE2-NEXT:    movq -128(%rsp,%rsi,8), %rax
+; X64-SSE2-NEXT:    movq -120(%rsp,%rsi,8), %rcx
+; X64-SSE2-NEXT:    movq -104(%rsp,%rsi,8), %rdi
+; X64-SSE2-NEXT:    movq -112(%rsp,%rsi,8), %r8
+; X64-SSE2-NEXT:    movq -88(%rsp,%rsi,8), %r9
+; X64-SSE2-NEXT:    movq -96(%rsp,%rsi,8), %r10
+; X64-SSE2-NEXT:    movq -72(%rsp,%rsi,8), %r11
+; X64-SSE2-NEXT:    movq -80(%rsp,%rsi,8), %rsi
 ; X64-SSE2-NEXT:    movq %rsi, 48(%rdx)
 ; X64-SSE2-NEXT:    movq %r11, 56(%rdx)
 ; X64-SSE2-NEXT:    movq %r10, 32(%rdx)
@@ -2414,8 +24316,9 @@ define void @ashr_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ; X64-SSE2-NEXT:    popq %rbx
 ; X64-SSE2-NEXT:    retq
 ;
-; X64-SSE42-LABEL: ashr_64bytes:
+; X64-SSE42-LABEL: ashr_64bytes_qwordOff:
 ; X64-SSE42:       # %bb.0:
+; X64-SSE42-NEXT:    pushq %rax
 ; X64-SSE42-NEXT:    movups (%rdi), %xmm0
 ; X64-SSE42-NEXT:    movups 16(%rdi), %xmm1
 ; X64-SSE42-NEXT:    movups 32(%rdi), %xmm2
@@ -2424,9 +24327,9 @@ define void @ashr_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ; X64-SSE42-NEXT:    movl (%rsi), %esi
 ; X64-SSE42-NEXT:    movq %rcx, -{{[0-9]+}}(%rsp)
 ; X64-SSE42-NEXT:    movq %rax, -{{[0-9]+}}(%rsp)
-; X64-SSE42-NEXT:    movups %xmm2, -{{[0-9]+}}(%rsp)
-; X64-SSE42-NEXT:    movups %xmm1, -{{[0-9]+}}(%rsp)
-; X64-SSE42-NEXT:    movups %xmm0, -{{[0-9]+}}(%rsp)
+; X64-SSE42-NEXT:    movaps %xmm2, -{{[0-9]+}}(%rsp)
+; X64-SSE42-NEXT:    movaps %xmm1, -{{[0-9]+}}(%rsp)
+; X64-SSE42-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
 ; X64-SSE42-NEXT:    sarq $63, %rcx
 ; X64-SSE42-NEXT:    movq %rcx, -{{[0-9]+}}(%rsp)
 ; X64-SSE42-NEXT:    movq %rcx, -{{[0-9]+}}(%rsp)
@@ -2436,19 +24339,21 @@ define void @ashr_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ; X64-SSE42-NEXT:    movq %rcx, -{{[0-9]+}}(%rsp)
 ; X64-SSE42-NEXT:    movq %rcx, -{{[0-9]+}}(%rsp)
 ; X64-SSE42-NEXT:    movq %rcx, -{{[0-9]+}}(%rsp)
-; X64-SSE42-NEXT:    andl $63, %esi
-; X64-SSE42-NEXT:    movups -128(%rsp,%rsi), %xmm0
-; X64-SSE42-NEXT:    movups -112(%rsp,%rsi), %xmm1
-; X64-SSE42-NEXT:    movups -96(%rsp,%rsi), %xmm2
-; X64-SSE42-NEXT:    movups -80(%rsp,%rsi), %xmm3
+; X64-SSE42-NEXT:    andl $7, %esi
+; X64-SSE42-NEXT:    movups -128(%rsp,%rsi,8), %xmm0
+; X64-SSE42-NEXT:    movups -112(%rsp,%rsi,8), %xmm1
+; X64-SSE42-NEXT:    movups -96(%rsp,%rsi,8), %xmm2
+; X64-SSE42-NEXT:    movups -80(%rsp,%rsi,8), %xmm3
 ; X64-SSE42-NEXT:    movups %xmm3, 48(%rdx)
 ; X64-SSE42-NEXT:    movups %xmm1, 16(%rdx)
 ; X64-SSE42-NEXT:    movups %xmm2, 32(%rdx)
 ; X64-SSE42-NEXT:    movups %xmm0, (%rdx)
+; X64-SSE42-NEXT:    popq %rax
 ; X64-SSE42-NEXT:    retq
 ;
-; X64-AVX-LABEL: ashr_64bytes:
+; X64-AVX-LABEL: ashr_64bytes_qwordOff:
 ; X64-AVX:       # %bb.0:
+; X64-AVX-NEXT:    pushq %rax
 ; X64-AVX-NEXT:    vmovups (%rdi), %ymm0
 ; X64-AVX-NEXT:    vmovups 32(%rdi), %xmm1
 ; X64-AVX-NEXT:    movq 48(%rdi), %rax
@@ -2456,7 +24361,7 @@ define void @ashr_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ; X64-AVX-NEXT:    movl (%rsi), %esi
 ; X64-AVX-NEXT:    movq %rcx, -{{[0-9]+}}(%rsp)
 ; X64-AVX-NEXT:    movq %rax, -{{[0-9]+}}(%rsp)
-; X64-AVX-NEXT:    vmovups %xmm1, -{{[0-9]+}}(%rsp)
+; X64-AVX-NEXT:    vmovaps %xmm1, -{{[0-9]+}}(%rsp)
 ; X64-AVX-NEXT:    vmovups %ymm0, -{{[0-9]+}}(%rsp)
 ; X64-AVX-NEXT:    sarq $63, %rcx
 ; X64-AVX-NEXT:    movq %rcx, -{{[0-9]+}}(%rsp)
@@ -2467,25 +24372,26 @@ define void @ashr_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ; X64-AVX-NEXT:    movq %rcx, -{{[0-9]+}}(%rsp)
 ; X64-AVX-NEXT:    movq %rcx, -{{[0-9]+}}(%rsp)
 ; X64-AVX-NEXT:    movq %rcx, -{{[0-9]+}}(%rsp)
-; X64-AVX-NEXT:    andl $63, %esi
-; X64-AVX-NEXT:    vmovups -128(%rsp,%rsi), %xmm0
-; X64-AVX-NEXT:    vmovups -112(%rsp,%rsi), %xmm1
-; X64-AVX-NEXT:    vmovups -96(%rsp,%rsi), %xmm2
-; X64-AVX-NEXT:    vmovups -80(%rsp,%rsi), %xmm3
+; X64-AVX-NEXT:    andl $7, %esi
+; X64-AVX-NEXT:    vmovups -128(%rsp,%rsi,8), %xmm0
+; X64-AVX-NEXT:    vmovups -112(%rsp,%rsi,8), %xmm1
+; X64-AVX-NEXT:    vmovups -96(%rsp,%rsi,8), %xmm2
+; X64-AVX-NEXT:    vmovups -80(%rsp,%rsi,8), %xmm3
 ; X64-AVX-NEXT:    vmovups %xmm3, 48(%rdx)
 ; X64-AVX-NEXT:    vmovups %xmm1, 16(%rdx)
 ; X64-AVX-NEXT:    vmovups %xmm2, 32(%rdx)
 ; X64-AVX-NEXT:    vmovups %xmm0, (%rdx)
+; X64-AVX-NEXT:    popq %rax
 ; X64-AVX-NEXT:    vzeroupper
 ; X64-AVX-NEXT:    retq
 ;
-; X86-SSE2-LABEL: ashr_64bytes:
+; X86-SSE2-LABEL: ashr_64bytes_qwordOff:
 ; X86-SSE2:       # %bb.0:
 ; X86-SSE2-NEXT:    pushl %ebp
 ; X86-SSE2-NEXT:    pushl %ebx
 ; X86-SSE2-NEXT:    pushl %edi
 ; X86-SSE2-NEXT:    pushl %esi
-; X86-SSE2-NEXT:    subl $168, %esp
+; X86-SSE2-NEXT:    subl $188, %esp
 ; X86-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-SSE2-NEXT:    movl (%eax), %ecx
 ; X86-SSE2-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
@@ -2506,7 +24412,7 @@ define void @ashr_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ; X86-SSE2-NEXT:    movl 32(%eax), %ecx
 ; X86-SSE2-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-SSE2-NEXT:    movl 36(%eax), %ecx
-; X86-SSE2-NEXT:    movl %ecx, (%esp) # 4-byte Spill
+; X86-SSE2-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-SSE2-NEXT:    movl 40(%eax), %ebp
 ; X86-SSE2-NEXT:    movl 44(%eax), %ebx
 ; X86-SSE2-NEXT:    movl 48(%eax), %edi
@@ -2520,7 +24426,7 @@ define void @ashr_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ; X86-SSE2-NEXT:    movl %edi, {{[0-9]+}}(%esp)
 ; X86-SSE2-NEXT:    movl %ebx, {{[0-9]+}}(%esp)
 ; X86-SSE2-NEXT:    movl %ebp, {{[0-9]+}}(%esp)
-; X86-SSE2-NEXT:    movl (%esp), %edx # 4-byte Reload
+; X86-SSE2-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
 ; X86-SSE2-NEXT:    movl %edx, {{[0-9]+}}(%esp)
 ; X86-SSE2-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
 ; X86-SSE2-NEXT:    movl %edx, {{[0-9]+}}(%esp)
@@ -2558,33 +24464,33 @@ define void @ashr_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ; X86-SSE2-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
 ; X86-SSE2-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
 ; X86-SSE2-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
-; X86-SSE2-NEXT:    andl $63, %eax
-; X86-SSE2-NEXT:    movl 40(%esp,%eax), %ecx
+; X86-SSE2-NEXT:    andl $7, %eax
+; X86-SSE2-NEXT:    movl 48(%esp,%eax,8), %ecx
+; X86-SSE2-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-SSE2-NEXT:    movl 52(%esp,%eax,8), %ecx
 ; X86-SSE2-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-SSE2-NEXT:    movl 44(%esp,%eax), %ecx
+; X86-SSE2-NEXT:    movl 60(%esp,%eax,8), %ecx
 ; X86-SSE2-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-SSE2-NEXT:    movl 52(%esp,%eax), %ecx
+; X86-SSE2-NEXT:    movl 56(%esp,%eax,8), %ecx
 ; X86-SSE2-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-SSE2-NEXT:    movl 48(%esp,%eax), %ecx
+; X86-SSE2-NEXT:    movl 68(%esp,%eax,8), %ecx
 ; X86-SSE2-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-SSE2-NEXT:    movl 60(%esp,%eax), %ecx
+; X86-SSE2-NEXT:    movl 64(%esp,%eax,8), %ecx
 ; X86-SSE2-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-SSE2-NEXT:    movl 56(%esp,%eax), %ecx
+; X86-SSE2-NEXT:    movl 76(%esp,%eax,8), %ecx
 ; X86-SSE2-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-SSE2-NEXT:    movl 68(%esp,%eax), %ecx
+; X86-SSE2-NEXT:    movl 72(%esp,%eax,8), %ecx
 ; X86-SSE2-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-SSE2-NEXT:    movl 64(%esp,%eax), %ecx
+; X86-SSE2-NEXT:    movl 84(%esp,%eax,8), %ecx
 ; X86-SSE2-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-SSE2-NEXT:    movl 76(%esp,%eax), %ecx
+; X86-SSE2-NEXT:    movl 80(%esp,%eax,8), %ecx
 ; X86-SSE2-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-SSE2-NEXT:    movl 72(%esp,%eax), %ecx
-; X86-SSE2-NEXT:    movl %ecx, (%esp) # 4-byte Spill
-; X86-SSE2-NEXT:    movl 84(%esp,%eax), %ebp
-; X86-SSE2-NEXT:    movl 80(%esp,%eax), %ebx
-; X86-SSE2-NEXT:    movl 92(%esp,%eax), %edi
-; X86-SSE2-NEXT:    movl 88(%esp,%eax), %esi
-; X86-SSE2-NEXT:    movl 100(%esp,%eax), %edx
-; X86-SSE2-NEXT:    movl 96(%esp,%eax), %ecx
+; X86-SSE2-NEXT:    movl 92(%esp,%eax,8), %ebp
+; X86-SSE2-NEXT:    movl 88(%esp,%eax,8), %ebx
+; X86-SSE2-NEXT:    movl 100(%esp,%eax,8), %edi
+; X86-SSE2-NEXT:    movl 96(%esp,%eax,8), %esi
+; X86-SSE2-NEXT:    movl 108(%esp,%eax,8), %edx
+; X86-SSE2-NEXT:    movl 104(%esp,%eax,8), %ecx
 ; X86-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-SSE2-NEXT:    movl %ecx, 56(%eax)
 ; X86-SSE2-NEXT:    movl %edx, 60(%eax)
@@ -2592,7 +24498,7 @@ define void @ashr_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ; X86-SSE2-NEXT:    movl %edi, 52(%eax)
 ; X86-SSE2-NEXT:    movl %ebx, 40(%eax)
 ; X86-SSE2-NEXT:    movl %ebp, 44(%eax)
-; X86-SSE2-NEXT:    movl (%esp), %ecx # 4-byte Reload
+; X86-SSE2-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
 ; X86-SSE2-NEXT:    movl %ecx, 32(%eax)
 ; X86-SSE2-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
 ; X86-SSE2-NEXT:    movl %ecx, 36(%eax)
@@ -2612,14 +24518,14 @@ define void @ashr_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ; X86-SSE2-NEXT:    movl %ecx, (%eax)
 ; X86-SSE2-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
 ; X86-SSE2-NEXT:    movl %ecx, 4(%eax)
-; X86-SSE2-NEXT:    addl $168, %esp
+; X86-SSE2-NEXT:    addl $188, %esp
 ; X86-SSE2-NEXT:    popl %esi
 ; X86-SSE2-NEXT:    popl %edi
 ; X86-SSE2-NEXT:    popl %ebx
 ; X86-SSE2-NEXT:    popl %ebp
 ; X86-SSE2-NEXT:    retl
 ;
-; X86-SSE42-LABEL: ashr_64bytes:
+; X86-SSE42-LABEL: ashr_64bytes_qwordOff:
 ; X86-SSE42:       # %bb.0:
 ; X86-SSE42-NEXT:    pushl %ebx
 ; X86-SSE42-NEXT:    pushl %edi
@@ -2640,9 +24546,9 @@ define void @ashr_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ; X86-SSE42-NEXT:    movl %ebx, {{[0-9]+}}(%esp)
 ; X86-SSE42-NEXT:    movl %edi, {{[0-9]+}}(%esp)
 ; X86-SSE42-NEXT:    movl %esi, {{[0-9]+}}(%esp)
-; X86-SSE42-NEXT:    movups %xmm2, {{[0-9]+}}(%esp)
-; X86-SSE42-NEXT:    movups %xmm1, {{[0-9]+}}(%esp)
-; X86-SSE42-NEXT:    movups %xmm0, (%esp)
+; X86-SSE42-NEXT:    movaps %xmm2, {{[0-9]+}}(%esp)
+; X86-SSE42-NEXT:    movaps %xmm1, {{[0-9]+}}(%esp)
+; X86-SSE42-NEXT:    movaps %xmm0, (%esp)
 ; X86-SSE42-NEXT:    sarl $31, %edx
 ; X86-SSE42-NEXT:    movl %edx, {{[0-9]+}}(%esp)
 ; X86-SSE42-NEXT:    movl %edx, {{[0-9]+}}(%esp)
@@ -2660,11 +24566,11 @@ define void @ashr_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ; X86-SSE42-NEXT:    movl %edx, {{[0-9]+}}(%esp)
 ; X86-SSE42-NEXT:    movl %edx, {{[0-9]+}}(%esp)
 ; X86-SSE42-NEXT:    movl %edx, {{[0-9]+}}(%esp)
-; X86-SSE42-NEXT:    andl $63, %ecx
-; X86-SSE42-NEXT:    movups (%esp,%ecx), %xmm0
-; X86-SSE42-NEXT:    movups 16(%esp,%ecx), %xmm1
-; X86-SSE42-NEXT:    movups 32(%esp,%ecx), %xmm2
-; X86-SSE42-NEXT:    movups 48(%esp,%ecx), %xmm3
+; X86-SSE42-NEXT:    andl $7, %ecx
+; X86-SSE42-NEXT:    movups (%esp,%ecx,8), %xmm0
+; X86-SSE42-NEXT:    movups 16(%esp,%ecx,8), %xmm1
+; X86-SSE42-NEXT:    movups 32(%esp,%ecx,8), %xmm2
+; X86-SSE42-NEXT:    movups 48(%esp,%ecx,8), %xmm3
 ; X86-SSE42-NEXT:    movups %xmm3, 48(%eax)
 ; X86-SSE42-NEXT:    movups %xmm2, 32(%eax)
 ; X86-SSE42-NEXT:    movups %xmm1, 16(%eax)
@@ -2675,7 +24581,7 @@ define void @ashr_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ; X86-SSE42-NEXT:    popl %ebx
 ; X86-SSE42-NEXT:    retl
 ;
-; X86-AVX-LABEL: ashr_64bytes:
+; X86-AVX-LABEL: ashr_64bytes_qwordOff:
 ; X86-AVX:       # %bb.0:
 ; X86-AVX-NEXT:    pushl %ebx
 ; X86-AVX-NEXT:    pushl %edi
@@ -2695,7 +24601,7 @@ define void @ashr_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ; X86-AVX-NEXT:    movl %ebx, {{[0-9]+}}(%esp)
 ; X86-AVX-NEXT:    movl %edi, {{[0-9]+}}(%esp)
 ; X86-AVX-NEXT:    movl %esi, {{[0-9]+}}(%esp)
-; X86-AVX-NEXT:    vmovups %xmm1, {{[0-9]+}}(%esp)
+; X86-AVX-NEXT:    vmovaps %xmm1, {{[0-9]+}}(%esp)
 ; X86-AVX-NEXT:    vmovups %ymm0, (%esp)
 ; X86-AVX-NEXT:    sarl $31, %edx
 ; X86-AVX-NEXT:    movl %edx, {{[0-9]+}}(%esp)
@@ -2714,11 +24620,11 @@ define void @ashr_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ; X86-AVX-NEXT:    movl %edx, {{[0-9]+}}(%esp)
 ; X86-AVX-NEXT:    movl %edx, {{[0-9]+}}(%esp)
 ; X86-AVX-NEXT:    movl %edx, {{[0-9]+}}(%esp)
-; X86-AVX-NEXT:    andl $63, %ecx
-; X86-AVX-NEXT:    vmovups (%esp,%ecx), %xmm0
-; X86-AVX-NEXT:    vmovups 16(%esp,%ecx), %xmm1
-; X86-AVX-NEXT:    vmovups 32(%esp,%ecx), %xmm2
-; X86-AVX-NEXT:    vmovups 48(%esp,%ecx), %xmm3
+; X86-AVX-NEXT:    andl $7, %ecx
+; X86-AVX-NEXT:    vmovups (%esp,%ecx,8), %xmm0
+; X86-AVX-NEXT:    vmovups 16(%esp,%ecx,8), %xmm1
+; X86-AVX-NEXT:    vmovups 32(%esp,%ecx,8), %xmm2
+; X86-AVX-NEXT:    vmovups 48(%esp,%ecx,8), %xmm3
 ; X86-AVX-NEXT:    vmovups %xmm3, 48(%eax)
 ; X86-AVX-NEXT:    vmovups %xmm2, 32(%eax)
 ; X86-AVX-NEXT:    vmovups %xmm1, 16(%eax)
@@ -2730,45 +24636,14 @@ define void @ashr_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ; X86-AVX-NEXT:    vzeroupper
 ; X86-AVX-NEXT:    retl
   %src = load i512, ptr %src.ptr, align 1
-  %byteOff = load i512, ptr %byteOff.ptr, align 1
-  %bitOff = shl i512 %byteOff, 3
+  %qwordOff = load i512, ptr %qwordOff.ptr, align 1
+  %bitOff = shl i512 %qwordOff, 6
   %res = ashr i512 %src, %bitOff
   store i512 %res, ptr %dst, align 1
   ret void
 }
+
 ;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
 ; ALL: {{.*}}
-; FALLBACK0: {{.*}}
-; FALLBACK1: {{.*}}
-; FALLBACK10: {{.*}}
-; FALLBACK11: {{.*}}
-; FALLBACK12: {{.*}}
-; FALLBACK13: {{.*}}
-; FALLBACK14: {{.*}}
-; FALLBACK15: {{.*}}
-; FALLBACK16: {{.*}}
-; FALLBACK17: {{.*}}
-; FALLBACK18: {{.*}}
-; FALLBACK19: {{.*}}
-; FALLBACK2: {{.*}}
-; FALLBACK20: {{.*}}
-; FALLBACK21: {{.*}}
-; FALLBACK22: {{.*}}
-; FALLBACK23: {{.*}}
-; FALLBACK24: {{.*}}
-; FALLBACK25: {{.*}}
-; FALLBACK26: {{.*}}
-; FALLBACK27: {{.*}}
-; FALLBACK28: {{.*}}
-; FALLBACK29: {{.*}}
-; FALLBACK3: {{.*}}
-; FALLBACK30: {{.*}}
-; FALLBACK31: {{.*}}
-; FALLBACK4: {{.*}}
-; FALLBACK5: {{.*}}
-; FALLBACK6: {{.*}}
-; FALLBACK7: {{.*}}
-; FALLBACK8: {{.*}}
-; FALLBACK9: {{.*}}
 ; X64: {{.*}}
 ; X86: {{.*}}
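
Two things change in the regenerated qwordOff checks in this file: the
loaded offset is masked with "andl $7" (eight qwords) and applied through
a scaled index such as -128(%rsp,%rsi,8), replacing the byte-granular
"andl $63" addressing, and the spill slot is now aligned, so movaps and
vmovaps stand in for the old unaligned movups/vmovups stores. Note that
the X64-SSE2 body performs no shift at all on the reloaded words: a
whole-qword offset needs no residual bit shift. A minimal C reference for
what that sequence computes follows; the function name, the 2x-sized
buffer, and the little-endian layout are my assumptions, not something
the test spells out:

#include <stdint.h>
#include <string.h>

/* Sketch of ashr_64bytes_qwordOff as the checks describe it: spill the
   512-bit value into an aligned 128-byte slot whose upper half holds the
   sign word, then read 64 bytes back at a scaled qword offset. */
void ashr_64bytes_qwordOff_ref(const uint64_t src[8], unsigned qwordOff,
                               uint64_t dst[8]) {
    _Alignas(16) uint64_t slot[16];
    memcpy(slot, src, 64);                    /* low half: the value  */
    uint64_t sign = (src[7] >> 63) ? ~(uint64_t)0 : 0;
    for (int i = 8; i < 16; ++i)
        slot[i] = sign;                       /* high half: sign fill */
    qwordOff &= 7;                            /* matches "andl $7"    */
    memcpy(dst, slot + qwordOff, 64);         /* scaled, aligned load */
}

The byteOff variants earlier in the file keep a residual shrdl/sarl (or
shrx/sarx) stage because their offset is only byte-granular.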

diff  --git a/llvm/test/CodeGen/X86/wide-scalar-shift-legalization.ll b/llvm/test/CodeGen/X86/wide-scalar-shift-legalization.ll
index f84131dfc87970..8c0873492ce402 100644
--- a/llvm/test/CodeGen/X86/wide-scalar-shift-legalization.ll
+++ b/llvm/test/CodeGen/X86/wide-scalar-shift-legalization.ll
@@ -588,61 +588,58 @@ define void @lshr_16bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; X86-NO-BMI2-NO-SHLD-NEXT:    pushl %ebx
 ; X86-NO-BMI2-NO-SHLD-NEXT:    pushl %edi
 ; X86-NO-BMI2-NO-SHLD-NEXT:    pushl %esi
-; X86-NO-BMI2-NO-SHLD-NEXT:    subl $36, %esp
+; X86-NO-BMI2-NO-SHLD-NEXT:    subl $44, %esp
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl (%ecx), %edx
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl (%ecx), %ebx
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl 4(%ecx), %esi
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl 8(%ecx), %edi
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl 12(%ecx), %ecx
-; X86-NO-BMI2-NO-SHLD-NEXT:    movb (%eax), %ah
+; X86-NO-BMI2-NO-SHLD-NEXT:    movb (%eax), %dh
+; X86-NO-BMI2-NO-SHLD-NEXT:    xorps %xmm0, %xmm0
+; X86-NO-BMI2-NO-SHLD-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl %edi, {{[0-9]+}}(%esp)
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl %esi, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %edx, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-NO-SHLD-NEXT:    movb %ah, %al
-; X86-NO-BMI2-NO-SHLD-NEXT:    andb $7, %al
-; X86-NO-BMI2-NO-SHLD-NEXT:    shrb $3, %ah
-; X86-NO-BMI2-NO-SHLD-NEXT:    andb $15, %ah
-; X86-NO-BMI2-NO-SHLD-NEXT:    movzbl %ah, %ebp
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl 8(%esp,%ebp), %esi
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ebx, (%esp)
+; X86-NO-BMI2-NO-SHLD-NEXT:    movb %dh, %cl
+; X86-NO-BMI2-NO-SHLD-NEXT:    shrb $3, %cl
+; X86-NO-BMI2-NO-SHLD-NEXT:    andb $12, %cl
+; X86-NO-BMI2-NO-SHLD-NEXT:    movzbl %cl, %eax
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl 4(%esp,%eax), %esi
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl %esi, %ebx
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT:    movb %dh, %cl
 ; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %ebx
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %edx
-; X86-NO-BMI2-NO-SHLD-NEXT:    notb %dl
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl 12(%esp,%ebp), %ecx
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ecx, (%esp) # 4-byte Spill
-; X86-NO-BMI2-NO-SHLD-NEXT:    leal (%ecx,%ecx), %edi
+; X86-NO-BMI2-NO-SHLD-NEXT:    movb %dh, %dl
+; X86-NO-BMI2-NO-SHLD-NEXT:    andb $31, %dl
+; X86-NO-BMI2-NO-SHLD-NEXT:    xorb $31, %dl
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl 8(%esp,%eax), %ebp
+; X86-NO-BMI2-NO-SHLD-NEXT:    leal (%ebp,%ebp), %edi
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl %edx, %ecx
 ; X86-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %edi
 ; X86-NO-BMI2-NO-SHLD-NEXT:    orl %ebx, %edi
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl 4(%esp,%ebp), %ebx
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl (%esp,%eax), %ebx
+; X86-NO-BMI2-NO-SHLD-NEXT:    movb %dh, %cl
 ; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %ebx
 ; X86-NO-BMI2-NO-SHLD-NEXT:    addl %esi, %esi
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl %edx, %ecx
 ; X86-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %esi
 ; X86-NO-BMI2-NO-SHLD-NEXT:    orl %ebx, %esi
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %ecx
-; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, (%esp) # 4-byte Folded Spill
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl 16(%esp,%ebp), %ebx
-; X86-NO-BMI2-NO-SHLD-NEXT:    leal (%ebx,%ebx), %ebp
+; X86-NO-BMI2-NO-SHLD-NEXT:    movb %dh, %cl
+; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %ebp
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl 12(%esp,%eax), %ebx
+; X86-NO-BMI2-NO-SHLD-NEXT:    leal (%ebx,%ebx), %eax
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl %edx, %ecx
-; X86-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %ebp
-; X86-NO-BMI2-NO-SHLD-NEXT:    orl (%esp), %ebp # 4-byte Folded Reload
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %eax
+; X86-NO-BMI2-NO-SHLD-NEXT:    orl %ebp, %eax
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ebp
+; X86-NO-BMI2-NO-SHLD-NEXT:    movb %dh, %cl
 ; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %ebx
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ebx, 12(%edx)
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ebp, 8(%edx)
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %esi, (%edx)
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %edi, 4(%edx)
-; X86-NO-BMI2-NO-SHLD-NEXT:    addl $36, %esp
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ebx, 12(%ebp)
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, 8(%ebp)
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %esi, (%ebp)
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %edi, 4(%ebp)
+; X86-NO-BMI2-NO-SHLD-NEXT:    addl $44, %esp
 ; X86-NO-BMI2-NO-SHLD-NEXT:    popl %esi
 ; X86-NO-BMI2-NO-SHLD-NEXT:    popl %edi
 ; X86-NO-BMI2-NO-SHLD-NEXT:    popl %ebx
@@ -655,50 +652,39 @@ define void @lshr_16bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; X86-NO-BMI2-HAVE-SHLD-NEXT:    pushl %ebx
 ; X86-NO-BMI2-HAVE-SHLD-NEXT:    pushl %edi
 ; X86-NO-BMI2-HAVE-SHLD-NEXT:    pushl %esi
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    subl $32, %esp
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    subl $44, %esp
 ; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl (%ecx), %edx
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 4(%ecx), %esi
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 8(%ecx), %edi
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 12(%ecx), %ecx
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movb (%eax), %ah
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl (%edx), %esi
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 4(%edx), %edi
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 8(%edx), %ebx
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 12(%edx), %edx
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movzbl (%ecx), %ecx
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    xorps %xmm0, %xmm0
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %ebx, {{[0-9]+}}(%esp)
 ; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %edi, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %esi, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %edx, (%esp)
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movb %ah, %al
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    andb $7, %al
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    shrb $3, %ah
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    andb $15, %ah
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movzbl %ah, %ebp
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 4(%esp,%ebp), %esi
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %esi, %edx
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, %ecx
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    shrl %cl, %edx
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    notb %cl
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 8(%esp,%ebp), %ebx
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    leal (%ebx,%ebx), %edi
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    shll %cl, %edi
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    orl %edx, %edi
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl (%esp,%ebp), %edx
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 12(%esp,%ebp), %ebp
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, %ecx
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %ebp, %ebx
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %ebx, 8(%ecx)
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, %ecx
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %esi, %edx
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    shrl %cl, %ebp
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %ebp, 12(%eax)
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %esi, (%esp)
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %ecx, %edx
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    shrb $3, %dl
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    andb $12, %dl
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movzbl %dl, %ebx
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 8(%esp,%ebx), %esi
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl (%esp,%ebx), %edx
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 4(%esp,%ebx), %ebp
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %ebp, %edi
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %esi, %edi
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 12(%esp,%ebx), %ebx
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %ebx, %esi
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %ebp, %edx
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    shrl %cl, %ebx
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %esi, 8(%eax)
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %ebx, 12(%eax)
 ; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %edx, (%eax)
 ; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %edi, 4(%eax)
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    addl $32, %esp
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    addl $44, %esp
 ; X86-NO-BMI2-HAVE-SHLD-NEXT:    popl %esi
 ; X86-NO-BMI2-HAVE-SHLD-NEXT:    popl %edi
 ; X86-NO-BMI2-HAVE-SHLD-NEXT:    popl %ebx
@@ -711,51 +697,49 @@ define void @lshr_16bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    pushl %ebx
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    pushl %edi
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    pushl %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    subl $32, %esp
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    subl $44, %esp
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl (%ecx), %edx
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 4(%ecx), %esi
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 8(%ecx), %edi
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 12(%ecx), %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movzbl (%eax), %ebx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movzbl (%eax), %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    xorps %xmm0, %xmm0
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %edi, {{[0-9]+}}(%esp)
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %esi, {{[0-9]+}}(%esp)
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %edx, (%esp)
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ebx, %eax
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    andb $7, %al
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrb $3, %bl
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    andb $15, %bl
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movzbl %bl, %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 4(%esp,%esi), %edi
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 8(%esp,%esi), %ebx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %eax, %edi, %ebp
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, %edx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    notb %dl
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    leal (%ebx,%ebx), %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %edx, %ecx, %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl %ebp, %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrb $3, %cl
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    andb $12, %cl
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movzbl %cl, %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 4(%esp,%esi), %ebx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 8(%esp,%esi), %edi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %eax, %ebx, %ebp
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    andb $31, %cl
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    xorb $31, %cl
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    leal (%edi,%edi), %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %ecx, %edx, %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl %ebp, %edx
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %eax, (%esp,%esi), %ebp
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    addl %edi, %edi
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %edx, %edi, %edi
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl %ebp, %edi
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %eax, %ebx, %ebx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    addl %ebx, %ebx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %ecx, %ebx, %ebx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl %ebp, %ebx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %eax, %edi, %edi
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 12(%esp,%esi), %esi
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %eax, %esi, %eax
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    addl %esi, %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %edx, %esi, %edx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl %ebx, %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %ecx, %esi, %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl %edi, %ecx
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %esi
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, 12(%esi)
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %edx, 8(%esi)
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %edi, (%esi)
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, 4(%esi)
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    addl $32, %esp
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, 8(%esi)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ebx, (%esi)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %edx, 4(%esi)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    addl $44, %esp
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    popl %esi
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    popl %edi
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    popl %ebx
@@ -768,47 +752,40 @@ define void @lshr_16bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    pushl %ebx
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    pushl %edi
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    pushl %esi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    subl $32, %esp
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    subl $44, %esp
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl (%ecx), %edx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 4(%ecx), %esi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 8(%ecx), %edi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 12(%ecx), %ecx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movzbl (%eax), %eax
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl (%edx), %esi
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 4(%edx), %edi
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 8(%edx), %ebx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 12(%edx), %edx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movzbl (%ecx), %ecx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    xorps %xmm0, %xmm0
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ebx, {{[0-9]+}}(%esp)
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edi, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %esi, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edx, (%esp)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, %ecx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    andb $7, %cl
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    shrb $3, %al
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    andb $15, %al
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movzbl %al, %ebx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 4(%esp,%ebx), %eax
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    shrxl %ecx, %eax, %ebp
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %esi, (%esp)
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ecx, %edx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    notb %dl
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 8(%esp,%ebx), %edi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    leal (%edi,%edi), %esi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    shlxl %edx, %esi, %edx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    orl %ebp, %edx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl (%esp,%ebx), %ebp
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 12(%esp,%ebx), %ebx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    shrb $3, %dl
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    andb $12, %dl
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movzbl %dl, %ebp
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 8(%esp,%ebp), %ebx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl (%esp,%ebp), %edx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 4(%esp,%ebp), %esi
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %esi, %edi
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %ebx, %edi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edi, 8(%esi)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    shrxl %ecx, %ebx, %edi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edi, 12(%esi)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 12(%esp,%ebp), %ebp
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %ebp, %ebx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ebx, 8(%eax)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    shrxl %ecx, %ebp, %ebx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ebx, 12(%eax)
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    # kill: def $cl killed $cl killed $ecx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %eax, %ebp
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ebp, (%esi)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edx, 4(%esi)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    addl $32, %esp
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %esi, %edx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edx, (%eax)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edi, 4(%eax)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    addl $44, %esp
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    popl %esi
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    popl %edi
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    popl %ebx
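
In the regenerated lshr_16bytes checks above, the four "movl $0" stores
become one xorps/movaps zero fill of the aligned upper half; the slot is
indexed at dword granularity via "shrb $3; andb $12" where the old code
paired a byte index (andb $15) with a 3-bit residual count (andb $7); and
the complement count for the funnel is formed as (cl & 31) ^ 31 instead
of with notb, which is equivalent for a 5-bit count since both equal
31 - (cl & 31). A hedged C reference for the whole pattern; naming and
the little-endian layout are my assumptions, and bitOff must stay below
128, as the test's IR requires:

#include <stdint.h>
#include <string.h>

/* The value sits in the low half of a zeroed, aligned 32-byte slot; each
   output dword funnels in its next-higher word with the complement count. */
void lshr_16bytes_ref(const uint32_t src[4], unsigned bitOff,
                      uint32_t dst[4]) {
    _Alignas(16) uint32_t slot[8] = {0};   /* upper half stays zero */
    memcpy(slot, src, 16);
    const uint32_t *p = slot + (((bitOff >> 3) & 12) >> 2);
    unsigned cl = bitOff & 31;             /* hardware masks cl anyway */
    for (int i = 0; i < 4; ++i)
        /* (x << 1) << (31 - cl) equals x << (32 - cl) without UB at
           cl == 0, mirroring the leal (%reg,%reg) doubling before shll. */
        dst[i] = (p[i] >> cl) | ((p[i + 1] << 1) << (cl ^ 31));
}
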
@@ -899,66 +876,62 @@ define void @shl_16bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; X86-NO-BMI2-NO-SHLD-NEXT:    pushl %ebx
 ; X86-NO-BMI2-NO-SHLD-NEXT:    pushl %edi
 ; X86-NO-BMI2-NO-SHLD-NEXT:    pushl %esi
-; X86-NO-BMI2-NO-SHLD-NEXT:    subl $40, %esp
+; X86-NO-BMI2-NO-SHLD-NEXT:    subl $60, %esp
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl (%ecx), %edx
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl (%ecx), %ebx
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl 4(%ecx), %esi
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl 8(%ecx), %edi
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl 12(%ecx), %ebx
-; X86-NO-BMI2-NO-SHLD-NEXT:    movzbl (%eax), %ecx
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ebx, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl 12(%ecx), %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT:    movb (%eax), %dh
+; X86-NO-BMI2-NO-SHLD-NEXT:    xorps %xmm0, %xmm0
+; X86-NO-BMI2-NO-SHLD-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl %edi, {{[0-9]+}}(%esp)
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl %esi, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %edx, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ecx, %eax
-; X86-NO-BMI2-NO-SHLD-NEXT:    andb $7, %al
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ebx, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-NO-SHLD-NEXT:    movb %dh, %cl
 ; X86-NO-BMI2-NO-SHLD-NEXT:    shrb $3, %cl
-; X86-NO-BMI2-NO-SHLD-NEXT:    andb $15, %cl
+; X86-NO-BMI2-NO-SHLD-NEXT:    andb $12, %cl
 ; X86-NO-BMI2-NO-SHLD-NEXT:    negb %cl
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movsbl %cl, %ebp
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl 24(%esp,%ebp), %ecx
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ecx, (%esp) # 4-byte Spill
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl 28(%esp,%ebp), %esi
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl 32(%esp,%ebp), %ebx
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl 36(%esp,%ebp), %esi
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl %esi, %edi
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT:    movb %dh, %cl
 ; X86-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %edi
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %edx
-; X86-NO-BMI2-NO-SHLD-NEXT:    notb %dl
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl (%esp), %ebx # 4-byte Reload
+; X86-NO-BMI2-NO-SHLD-NEXT:    movb %dh, %dl
+; X86-NO-BMI2-NO-SHLD-NEXT:    andb $31, %dl
+; X86-NO-BMI2-NO-SHLD-NEXT:    xorb $31, %dl
 ; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %ebx
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl %edx, %ecx
 ; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %ebx
 ; X86-NO-BMI2-NO-SHLD-NEXT:    orl %edi, %ebx
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl 36(%esp,%ebp), %edi
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %ecx
-; X86-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %edi
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl 32(%esp,%ebp), %edi
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl 44(%esp,%ebp), %eax
+; X86-NO-BMI2-NO-SHLD-NEXT:    movb %dh, %cl
+; X86-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %eax
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl 40(%esp,%ebp), %edi
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl %edi, %ebp
 ; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %ebp
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl %edx, %ecx
 ; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %ebp
-; X86-NO-BMI2-NO-SHLD-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT:    orl %eax, %ebp
+; X86-NO-BMI2-NO-SHLD-NEXT:    movb %dh, %cl
 ; X86-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %edi
 ; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %esi
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl %edx, %ecx
 ; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %esi
 ; X86-NO-BMI2-NO-SHLD-NEXT:    orl %edi, %esi
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %ecx
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl (%esp), %eax # 4-byte Reload
-; X86-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %eax
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, (%edx)
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %esi, 8(%edx)
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ebp, 12(%edx)
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ebx, 4(%edx)
-; X86-NO-BMI2-NO-SHLD-NEXT:    addl $40, %esp
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NO-BMI2-NO-SHLD-NEXT:    movb %dh, %cl
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %edx
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %edx, (%eax)
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %esi, 8(%eax)
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ebp, 12(%eax)
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ebx, 4(%eax)
+; X86-NO-BMI2-NO-SHLD-NEXT:    addl $60, %esp
 ; X86-NO-BMI2-NO-SHLD-NEXT:    popl %esi
 ; X86-NO-BMI2-NO-SHLD-NEXT:    popl %edi
 ; X86-NO-BMI2-NO-SHLD-NEXT:    popl %ebx
@@ -967,58 +940,45 @@ define void @shl_16bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ;
 ; X86-NO-BMI2-HAVE-SHLD-LABEL: shl_16bytes:
 ; X86-NO-BMI2-HAVE-SHLD:       # %bb.0:
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    pushl %ebp
 ; X86-NO-BMI2-HAVE-SHLD-NEXT:    pushl %ebx
 ; X86-NO-BMI2-HAVE-SHLD-NEXT:    pushl %edi
 ; X86-NO-BMI2-HAVE-SHLD-NEXT:    pushl %esi
 ; X86-NO-BMI2-HAVE-SHLD-NEXT:    subl $32, %esp
 ; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl (%ecx), %edx
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 4(%ecx), %esi
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 8(%ecx), %edi
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 12(%ecx), %ebx
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movzbl (%eax), %ecx
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl (%edx), %esi
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 4(%edx), %edi
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 8(%edx), %ebx
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 12(%edx), %edx
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movzbl (%ecx), %ecx
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    xorps %xmm0, %xmm0
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movaps %xmm0, (%esp)
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %edx, {{[0-9]+}}(%esp)
 ; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %ebx, {{[0-9]+}}(%esp)
 ; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %edi, {{[0-9]+}}(%esp)
 ; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %esi, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %edx, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl $0, (%esp)
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %ecx, %eax
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    andb $7, %al
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    shrb $3, %cl
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    andb $15, %cl
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    negb %cl
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movsbl %cl, %ebp
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 24(%esp,%ebp), %ebx
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %ebx, %edx
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, %ecx
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    shll %cl, %edx
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    notb %cl
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 20(%esp,%ebp), %edi
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %edi, %esi
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    shrl %esi
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    shrl %cl, %esi
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    orl %edx, %esi
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 16(%esp,%ebp), %edx
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 28(%esp,%ebp), %ebp
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, %ecx
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    shldl %cl, %ebx, %ebp
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ebx
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %ebp, 12(%ebx)
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    shldl %cl, %edx, %edi
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    shll %cl, %edx
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %edx, (%ebx)
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %edi, 4(%ebx)
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %esi, 8(%ebx)
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %ecx, %edx
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    shrb $3, %dl
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    andb $12, %dl
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    negb %dl
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movsbl %dl, %edi
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 24(%esp,%edi), %esi
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 28(%esp,%edi), %edx
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    shldl %cl, %esi, %edx
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 16(%esp,%edi), %ebx
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 20(%esp,%edi), %edi
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    shldl %cl, %edi, %esi
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    shldl %cl, %ebx, %edi
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    shll %cl, %ebx
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %esi, 8(%eax)
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %edx, 12(%eax)
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %ebx, (%eax)
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %edi, 4(%eax)
 ; X86-NO-BMI2-HAVE-SHLD-NEXT:    addl $32, %esp
 ; X86-NO-BMI2-HAVE-SHLD-NEXT:    popl %esi
 ; X86-NO-BMI2-HAVE-SHLD-NEXT:    popl %edi
 ; X86-NO-BMI2-HAVE-SHLD-NEXT:    popl %ebx
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    popl %ebp
 ; X86-NO-BMI2-HAVE-SHLD-NEXT:    retl
 ;
 ; X86-HAVE-BMI2-NO-SHLD-LABEL: shl_16bytes:
@@ -1027,34 +987,32 @@ define void @shl_16bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    pushl %ebx
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    pushl %edi
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    pushl %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    subl $32, %esp
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    subl $44, %esp
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl (%ecx), %edx
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 4(%ecx), %esi
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 8(%ecx), %edi
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 12(%ecx), %ebx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movzbl (%eax), %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ebx, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 12(%ecx), %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movzbl (%eax), %ebx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    xorps %xmm0, %xmm0
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movaps %xmm0, (%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %edi, {{[0-9]+}}(%esp)
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %esi, {{[0-9]+}}(%esp)
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %edx, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, %ebx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    andb $7, %bl
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl $0, (%esp)
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrb $3, %cl
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    andb $15, %cl
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    negb %cl
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movsbl %cl, %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ebx, %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrb $3, %al
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    andb $12, %al
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    negb %al
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movsbl %al, %edx
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 16(%esp,%edx), %edi
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 20(%esp,%edx), %ecx
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %ebx, %ecx, %esi
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %ebx, %edi, %ebp
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ebx, %eax
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    notb %al
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    andb $31, %al
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    xorb $31, %al
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrl %edi
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %eax, %edi, %edi
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl %esi, %edi
@@ -1072,7 +1030,7 @@ define void @shl_16bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, 8(%ecx)
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %edx, 12(%ecx)
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %edi, 4(%ecx)
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    addl $32, %esp
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    addl $44, %esp
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    popl %esi
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    popl %edi
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    popl %ebx
@@ -1081,57 +1039,45 @@ define void @shl_16bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ;
 ; X86-HAVE-BMI2-HAVE-SHLD-LABEL: shl_16bytes:
 ; X86-HAVE-BMI2-HAVE-SHLD:       # %bb.0:
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    pushl %ebp
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    pushl %ebx
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    pushl %edi
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    pushl %esi
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    subl $32, %esp
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl (%ecx), %edx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 4(%ecx), %esi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 8(%ecx), %edi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 12(%ecx), %ecx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movzbl (%eax), %eax
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl (%edx), %esi
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 4(%edx), %edi
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 8(%edx), %ebx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 12(%edx), %edx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movzbl (%ecx), %ecx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    xorps %xmm0, %xmm0
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movaps %xmm0, (%esp)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ebx, {{[0-9]+}}(%esp)
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edi, {{[0-9]+}}(%esp)
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %esi, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edx, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, %ecx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    andb $7, %cl
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl $0, (%esp)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    shrb $3, %al
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    andb $15, %al
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    negb %al
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movsbl %al, %ebx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 24(%esp,%ebx), %edi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    shlxl %ecx, %edi, %ebp
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ecx, %edx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    notb %dl
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 20(%esp,%ebx), %eax
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, %esi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    shrl %esi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    shrxl %edx, %esi, %edx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    orl %ebp, %edx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 16(%esp,%ebx), %ebp
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 28(%esp,%ebx), %ebx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    shldl %cl, %edi, %ebx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ebx, 12(%esi)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    shlxl %ecx, %ebp, %edi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edi, (%esi)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    # kill: def $cl killed $cl killed $ecx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    shldl %cl, %ebp, %eax
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, 4(%esi)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edx, 8(%esi)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    shrb $3, %dl
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    andb $12, %dl
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    negb %dl
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movsbl %dl, %edi
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 24(%esp,%edi), %esi
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 28(%esp,%edi), %edx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    shldl %cl, %esi, %edx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 16(%esp,%edi), %ebx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 20(%esp,%edi), %edi
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    shldl %cl, %edi, %esi
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    shldl %cl, %ebx, %edi
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    shlxl %ecx, %ebx, %ecx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %esi, 8(%eax)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edx, 12(%eax)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ecx, (%eax)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edi, 4(%eax)
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    addl $32, %esp
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    popl %esi
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    popl %edi
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    popl %ebx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    popl %ebp
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    retl
   %src = load i128, ptr %src.ptr, align 1
   %bitOff = load i128, ptr %bitOff.ptr, align 1
@@ -1218,62 +1164,61 @@ define void @ashr_16bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; X86-NO-BMI2-NO-SHLD-NEXT:    pushl %ebx
 ; X86-NO-BMI2-NO-SHLD-NEXT:    pushl %edi
 ; X86-NO-BMI2-NO-SHLD-NEXT:    pushl %esi
-; X86-NO-BMI2-NO-SHLD-NEXT:    subl $36, %esp
+; X86-NO-BMI2-NO-SHLD-NEXT:    subl $44, %esp
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl (%ecx), %edx
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl (%ecx), %ebx
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl 4(%ecx), %esi
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl 8(%ecx), %edi
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl 12(%ecx), %ebx
-; X86-NO-BMI2-NO-SHLD-NEXT:    movzbl (%eax), %ecx
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ebx, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl 12(%ecx), %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT:    movb (%eax), %dh
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl %edi, {{[0-9]+}}(%esp)
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl %esi, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %edx, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-NO-SHLD-NEXT:    sarl $31, %ebx
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ebx, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ebx, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ebx, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ebx, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ecx, %eax
-; X86-NO-BMI2-NO-SHLD-NEXT:    andb $7, %al
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ebx, (%esp)
+; X86-NO-BMI2-NO-SHLD-NEXT:    sarl $31, %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-NO-SHLD-NEXT:    movb %dh, %cl
 ; X86-NO-BMI2-NO-SHLD-NEXT:    shrb $3, %cl
-; X86-NO-BMI2-NO-SHLD-NEXT:    andb $15, %cl
-; X86-NO-BMI2-NO-SHLD-NEXT:    movzbl %cl, %ebp
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl 8(%esp,%ebp), %esi
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %esi, %ebx
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %ecx
-; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %ebx
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %edx
-; X86-NO-BMI2-NO-SHLD-NEXT:    notb %dl
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl 12(%esp,%ebp), %ecx
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ecx, (%esp) # 4-byte Spill
-; X86-NO-BMI2-NO-SHLD-NEXT:    leal (%ecx,%ecx), %edi
+; X86-NO-BMI2-NO-SHLD-NEXT:    andb $12, %cl
+; X86-NO-BMI2-NO-SHLD-NEXT:    movzbl %cl, %ebx
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl 4(%esp,%ebx), %esi
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %esi, %eax
+; X86-NO-BMI2-NO-SHLD-NEXT:    movb %dh, %cl
+; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %eax
+; X86-NO-BMI2-NO-SHLD-NEXT:    movb %dh, %dl
+; X86-NO-BMI2-NO-SHLD-NEXT:    andb $31, %dl
+; X86-NO-BMI2-NO-SHLD-NEXT:    xorb $31, %dl
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl 8(%esp,%ebx), %ebp
+; X86-NO-BMI2-NO-SHLD-NEXT:    leal (%ebp,%ebp), %edi
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl %edx, %ecx
 ; X86-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %edi
-; X86-NO-BMI2-NO-SHLD-NEXT:    orl %ebx, %edi
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl 4(%esp,%ebp), %ebx
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %ecx
-; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %ebx
+; X86-NO-BMI2-NO-SHLD-NEXT:    orl %eax, %edi
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl (%esp,%ebx), %eax
+; X86-NO-BMI2-NO-SHLD-NEXT:    movb %dh, %cl
+; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %eax
 ; X86-NO-BMI2-NO-SHLD-NEXT:    addl %esi, %esi
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl %edx, %ecx
 ; X86-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %esi
-; X86-NO-BMI2-NO-SHLD-NEXT:    orl %ebx, %esi
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %ecx
-; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, (%esp) # 4-byte Folded Spill
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl 16(%esp,%ebp), %ebx
-; X86-NO-BMI2-NO-SHLD-NEXT:    leal (%ebx,%ebx), %ebp
+; X86-NO-BMI2-NO-SHLD-NEXT:    orl %eax, %esi
+; X86-NO-BMI2-NO-SHLD-NEXT:    movb %dh, %cl
+; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %ebp
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl 12(%esp,%ebx), %eax
+; X86-NO-BMI2-NO-SHLD-NEXT:    leal (%eax,%eax), %ebx
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl %edx, %ecx
-; X86-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %ebp
-; X86-NO-BMI2-NO-SHLD-NEXT:    orl (%esp), %ebp # 4-byte Folded Reload
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %ecx
-; X86-NO-BMI2-NO-SHLD-NEXT:    sarl %cl, %ebx
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ebx, 12(%edx)
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ebp, 8(%edx)
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %esi, (%edx)
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %edi, 4(%edx)
-; X86-NO-BMI2-NO-SHLD-NEXT:    addl $36, %esp
+; X86-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %ebx
+; X86-NO-BMI2-NO-SHLD-NEXT:    orl %ebp, %ebx
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ebp
+; X86-NO-BMI2-NO-SHLD-NEXT:    movb %dh, %cl
+; X86-NO-BMI2-NO-SHLD-NEXT:    sarl %cl, %eax
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, 12(%ebp)
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ebx, 8(%ebp)
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %esi, (%ebp)
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %edi, 4(%ebp)
+; X86-NO-BMI2-NO-SHLD-NEXT:    addl $44, %esp
 ; X86-NO-BMI2-NO-SHLD-NEXT:    popl %esi
 ; X86-NO-BMI2-NO-SHLD-NEXT:    popl %edi
 ; X86-NO-BMI2-NO-SHLD-NEXT:    popl %ebx
@@ -1286,51 +1231,42 @@ define void @ashr_16bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; X86-NO-BMI2-HAVE-SHLD-NEXT:    pushl %ebx
 ; X86-NO-BMI2-HAVE-SHLD-NEXT:    pushl %edi
 ; X86-NO-BMI2-HAVE-SHLD-NEXT:    pushl %esi
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    subl $32, %esp
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    subl $44, %esp
 ; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl (%ecx), %edx
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 4(%ecx), %esi
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 8(%ecx), %edi
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 12(%ecx), %ebx
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movzbl (%eax), %ecx
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl (%edx), %esi
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 4(%edx), %edi
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 8(%edx), %ebx
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 12(%edx), %edx
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movzbl (%ecx), %ecx
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %edx, {{[0-9]+}}(%esp)
 ; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %ebx, {{[0-9]+}}(%esp)
 ; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %edi, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %esi, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %edx, (%esp)
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    sarl $31, %ebx
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %ebx, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %ebx, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %ebx, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %ebx, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %ecx, %eax
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    andb $7, %al
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    shrb $3, %cl
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    andb $15, %cl
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movzbl %cl, %ebp
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 4(%esp,%ebp), %esi
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %esi, %edx
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, %ecx
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    shrl %cl, %edx
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    notb %cl
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 8(%esp,%ebp), %ebx
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    leal (%ebx,%ebx), %edi
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    shll %cl, %edi
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    orl %edx, %edi
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl (%esp,%ebp), %edx
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 12(%esp,%ebp), %ebp
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, %ecx
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %ebp, %ebx
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %ebx, 8(%ecx)
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, %ecx
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %esi, %edx
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    sarl %cl, %ebp
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %ebp, 12(%eax)
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %esi, (%esp)
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    sarl $31, %edx
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %ecx, %edx
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    shrb $3, %dl
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    andb $12, %dl
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movzbl %dl, %ebx
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 8(%esp,%ebx), %esi
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl (%esp,%ebx), %edx
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 4(%esp,%ebx), %ebp
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %ebp, %edi
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %esi, %edi
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 12(%esp,%ebx), %ebx
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %ebx, %esi
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %ebp, %edx
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    sarl %cl, %ebx
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %esi, 8(%eax)
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %ebx, 12(%eax)
 ; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %edx, (%eax)
 ; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %edi, 4(%eax)
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    addl $32, %esp
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    addl $44, %esp
 ; X86-NO-BMI2-HAVE-SHLD-NEXT:    popl %esi
 ; X86-NO-BMI2-HAVE-SHLD-NEXT:    popl %edi
 ; X86-NO-BMI2-HAVE-SHLD-NEXT:    popl %ebx
@@ -1343,52 +1279,52 @@ define void @ashr_16bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    pushl %ebx
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    pushl %edi
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    pushl %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    subl $32, %esp
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    subl $44, %esp
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl (%ecx), %edx
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 4(%ecx), %esi
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 8(%ecx), %edi
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 12(%ecx), %ebx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movzbl (%eax), %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ebx, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 12(%ecx), %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movzbl (%eax), %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %edi, {{[0-9]+}}(%esp)
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %esi, {{[0-9]+}}(%esp)
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %edx, (%esp)
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    sarl $31, %ebx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ebx, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ebx, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ebx, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ebx, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, %eax
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    andb $7, %al
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    sarl $31, %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, %ecx
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrb $3, %cl
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    andb $15, %cl
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    andb $12, %cl
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movzbl %cl, %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 4(%esp,%esi), %edi
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 8(%esp,%esi), %ebx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %eax, %edi, %ebp
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, %edx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    notb %dl
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    leal (%ebx,%ebx), %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %edx, %ecx, %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl %ebp, %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 4(%esp,%esi), %ebx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 8(%esp,%esi), %edi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %eax, %ebx, %ebp
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    andb $31, %cl
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    xorb $31, %cl
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    leal (%edi,%edi), %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %ecx, %edx, %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl %ebp, %edx
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %eax, (%esp,%esi), %ebp
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    addl %edi, %edi
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %edx, %edi, %edi
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl %ebp, %edi
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %eax, %ebx, %ebx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    addl %ebx, %ebx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %ecx, %ebx, %ebx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl %ebp, %ebx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %eax, %edi, %edi
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 12(%esp,%esi), %esi
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    sarxl %eax, %esi, %eax
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    addl %esi, %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %edx, %esi, %edx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl %ebx, %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %ecx, %esi, %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl %edi, %ecx
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %esi
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, 12(%esi)
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %edx, 8(%esi)
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %edi, (%esi)
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, 4(%esi)
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    addl $32, %esp
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, 8(%esi)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ebx, (%esi)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %edx, 4(%esi)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    addl $44, %esp
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    popl %esi
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    popl %edi
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    popl %ebx
@@ -1401,48 +1337,43 @@ define void @ashr_16bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    pushl %ebx
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    pushl %edi
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    pushl %esi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    subl $32, %esp
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    subl $44, %esp
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl (%ecx), %edx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 4(%ecx), %esi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 8(%ecx), %edi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 12(%ecx), %ecx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movzbl (%eax), %eax
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl (%edx), %esi
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 4(%edx), %edi
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 8(%edx), %ebx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 12(%edx), %edx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movzbl (%ecx), %ecx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ebx, {{[0-9]+}}(%esp)
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edi, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %esi, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edx, (%esp)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    sarl $31, %ecx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, %ecx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    andb $7, %cl
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    shrb $3, %al
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    andb $15, %al
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movzbl %al, %ebx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 4(%esp,%ebx), %eax
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    shrxl %ecx, %eax, %ebp
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %esi, (%esp)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    sarl $31, %edx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edx, {{[0-9]+}}(%esp)
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ecx, %edx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    notb %dl
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 8(%esp,%ebx), %edi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    leal (%edi,%edi), %esi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    shlxl %edx, %esi, %edx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    orl %ebp, %edx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl (%esp,%ebx), %ebp
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 12(%esp,%ebx), %ebx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    shrb $3, %dl
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    andb $12, %dl
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movzbl %dl, %ebp
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 8(%esp,%ebp), %ebx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl (%esp,%ebp), %edx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 4(%esp,%ebp), %esi
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %esi, %edi
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %ebx, %edi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edi, 8(%esi)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    sarxl %ecx, %ebx, %edi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edi, 12(%esi)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 12(%esp,%ebp), %ebp
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %ebp, %ebx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ebx, 8(%eax)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    sarxl %ecx, %ebp, %ebx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ebx, 12(%eax)
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    # kill: def $cl killed $cl killed $ecx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %eax, %ebp
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ebp, (%esi)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edx, 4(%esi)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    addl $32, %esp
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %esi, %edx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edx, (%eax)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edi, 4(%eax)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    addl $44, %esp
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    popl %esi
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    popl %edi
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    popl %ebx
@@ -1459,35 +1390,34 @@ define void @lshr_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; X64-NO-BMI2-NO-SHLD-LABEL: lshr_32bytes:
 ; X64-NO-BMI2-NO-SHLD:       # %bb.0:
 ; X64-NO-BMI2-NO-SHLD-NEXT:    pushq %rbx
-; X64-NO-BMI2-NO-SHLD-NEXT:    movq (%rdi), %rax
-; X64-NO-BMI2-NO-SHLD-NEXT:    movq 8(%rdi), %rcx
-; X64-NO-BMI2-NO-SHLD-NEXT:    movq 16(%rdi), %r8
+; X64-NO-BMI2-NO-SHLD-NEXT:    movq (%rdi), %rcx
+; X64-NO-BMI2-NO-SHLD-NEXT:    movq 8(%rdi), %r8
+; X64-NO-BMI2-NO-SHLD-NEXT:    movq 16(%rdi), %r9
 ; X64-NO-BMI2-NO-SHLD-NEXT:    movq 24(%rdi), %rdi
-; X64-NO-BMI2-NO-SHLD-NEXT:    movzbl (%rsi), %esi
+; X64-NO-BMI2-NO-SHLD-NEXT:    movzbl (%rsi), %eax
+; X64-NO-BMI2-NO-SHLD-NEXT:    xorps %xmm0, %xmm0
+; X64-NO-BMI2-NO-SHLD-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-NO-SHLD-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
 ; X64-NO-BMI2-NO-SHLD-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-NO-SHLD-NEXT:    movq %r9, -{{[0-9]+}}(%rsp)
 ; X64-NO-BMI2-NO-SHLD-NEXT:    movq %r8, -{{[0-9]+}}(%rsp)
 ; X64-NO-BMI2-NO-SHLD-NEXT:    movq %rcx, -{{[0-9]+}}(%rsp)
-; X64-NO-BMI2-NO-SHLD-NEXT:    movq %rax, -{{[0-9]+}}(%rsp)
-; X64-NO-BMI2-NO-SHLD-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
-; X64-NO-BMI2-NO-SHLD-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
-; X64-NO-BMI2-NO-SHLD-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
-; X64-NO-BMI2-NO-SHLD-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
-; X64-NO-BMI2-NO-SHLD-NEXT:    movl %esi, %eax
-; X64-NO-BMI2-NO-SHLD-NEXT:    andb $7, %al
-; X64-NO-BMI2-NO-SHLD-NEXT:    shrb $3, %sil
-; X64-NO-BMI2-NO-SHLD-NEXT:    movzbl %sil, %r9d
-; X64-NO-BMI2-NO-SHLD-NEXT:    movq -64(%rsp,%r9), %r10
-; X64-NO-BMI2-NO-SHLD-NEXT:    movq -56(%rsp,%r9), %rdi
+; X64-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %ecx
+; X64-NO-BMI2-NO-SHLD-NEXT:    shrb $6, %cl
+; X64-NO-BMI2-NO-SHLD-NEXT:    movzbl %cl, %r8d
+; X64-NO-BMI2-NO-SHLD-NEXT:    movq -64(%rsp,%r8,8), %r10
+; X64-NO-BMI2-NO-SHLD-NEXT:    movq -56(%rsp,%r8,8), %rdi
 ; X64-NO-BMI2-NO-SHLD-NEXT:    movq %rdi, %r11
 ; X64-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %ecx
 ; X64-NO-BMI2-NO-SHLD-NEXT:    shrq %cl, %r11
 ; X64-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %esi
-; X64-NO-BMI2-NO-SHLD-NEXT:    notb %sil
-; X64-NO-BMI2-NO-SHLD-NEXT:    movq -48(%rsp,%r9), %rbx
-; X64-NO-BMI2-NO-SHLD-NEXT:    leaq (%rbx,%rbx), %r8
+; X64-NO-BMI2-NO-SHLD-NEXT:    andb $63, %sil
+; X64-NO-BMI2-NO-SHLD-NEXT:    xorb $63, %sil
+; X64-NO-BMI2-NO-SHLD-NEXT:    movq -48(%rsp,%r8,8), %rbx
+; X64-NO-BMI2-NO-SHLD-NEXT:    leaq (%rbx,%rbx), %r9
 ; X64-NO-BMI2-NO-SHLD-NEXT:    movl %esi, %ecx
-; X64-NO-BMI2-NO-SHLD-NEXT:    shlq %cl, %r8
-; X64-NO-BMI2-NO-SHLD-NEXT:    orq %r11, %r8
+; X64-NO-BMI2-NO-SHLD-NEXT:    shlq %cl, %r9
+; X64-NO-BMI2-NO-SHLD-NEXT:    orq %r11, %r9
 ; X64-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %ecx
 ; X64-NO-BMI2-NO-SHLD-NEXT:    shrq %cl, %r10
 ; X64-NO-BMI2-NO-SHLD-NEXT:    addq %rdi, %rdi
@@ -1496,142 +1426,124 @@ define void @lshr_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; X64-NO-BMI2-NO-SHLD-NEXT:    orq %r10, %rdi
 ; X64-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %ecx
 ; X64-NO-BMI2-NO-SHLD-NEXT:    shrq %cl, %rbx
-; X64-NO-BMI2-NO-SHLD-NEXT:    movq -40(%rsp,%r9), %r9
-; X64-NO-BMI2-NO-SHLD-NEXT:    leaq (%r9,%r9), %r10
+; X64-NO-BMI2-NO-SHLD-NEXT:    movq -40(%rsp,%r8,8), %r8
+; X64-NO-BMI2-NO-SHLD-NEXT:    leaq (%r8,%r8), %r10
 ; X64-NO-BMI2-NO-SHLD-NEXT:    movl %esi, %ecx
 ; X64-NO-BMI2-NO-SHLD-NEXT:    shlq %cl, %r10
 ; X64-NO-BMI2-NO-SHLD-NEXT:    orq %rbx, %r10
 ; X64-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %ecx
-; X64-NO-BMI2-NO-SHLD-NEXT:    shrq %cl, %r9
-; X64-NO-BMI2-NO-SHLD-NEXT:    movq %r9, 24(%rdx)
+; X64-NO-BMI2-NO-SHLD-NEXT:    shrq %cl, %r8
+; X64-NO-BMI2-NO-SHLD-NEXT:    movq %r8, 24(%rdx)
 ; X64-NO-BMI2-NO-SHLD-NEXT:    movq %r10, 16(%rdx)
 ; X64-NO-BMI2-NO-SHLD-NEXT:    movq %rdi, (%rdx)
-; X64-NO-BMI2-NO-SHLD-NEXT:    movq %r8, 8(%rdx)
+; X64-NO-BMI2-NO-SHLD-NEXT:    movq %r9, 8(%rdx)
 ; X64-NO-BMI2-NO-SHLD-NEXT:    popq %rbx
 ; X64-NO-BMI2-NO-SHLD-NEXT:    retq
 ;
 ; X64-NO-BMI2-HAVE-SHLD-LABEL: lshr_32bytes:
 ; X64-NO-BMI2-HAVE-SHLD:       # %bb.0:
 ; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq (%rdi), %rax
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq 8(%rdi), %rcx
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq 16(%rdi), %r8
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq 8(%rdi), %r8
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq 16(%rdi), %r9
 ; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq 24(%rdi), %rdi
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    movzbl (%rsi), %esi
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movzbl (%rsi), %ecx
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    xorps %xmm0, %xmm0
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
 ; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %r9, -{{[0-9]+}}(%rsp)
 ; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %r8, -{{[0-9]+}}(%rsp)
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %rcx, -{{[0-9]+}}(%rsp)
 ; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %rax, -{{[0-9]+}}(%rsp)
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    movl %esi, %eax
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    andb $7, %al
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    shrb $3, %sil
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    movzbl %sil, %esi
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq -64(%rsp,%rsi), %rdi
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq -56(%rsp,%rsi), %r8
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movl %ecx, %eax
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    shrb $6, %al
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movzbl %al, %eax
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq -56(%rsp,%rax,8), %rsi
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq -72(%rsp,%rax,8), %rdi
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq -64(%rsp,%rax,8), %r8
 ; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %r8, %r9
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, %ecx
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    shrq %cl, %r9
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    notb %cl
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq -48(%rsp,%rsi), %r10
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    leaq (%r10,%r10), %r11
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    shlq %cl, %r11
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    orq %r9, %r11
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq -40(%rsp,%rsi), %rsi
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, %ecx
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    shrdq %cl, %rsi, %r10
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    shrdq %cl, %rsi, %r9
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq -48(%rsp,%rax,8), %rax
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    shrdq %cl, %rax, %rsi
 ; X64-NO-BMI2-HAVE-SHLD-NEXT:    shrdq %cl, %r8, %rdi
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    shrq %cl, %rsi
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %r10, 16(%rdx)
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %rsi, 24(%rdx)
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    shrq %cl, %rax
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %rsi, 16(%rdx)
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %rax, 24(%rdx)
 ; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %rdi, (%rdx)
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %r11, 8(%rdx)
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %r9, 8(%rdx)
 ; X64-NO-BMI2-HAVE-SHLD-NEXT:    retq
 ;
 ; X64-HAVE-BMI2-NO-SHLD-LABEL: lshr_32bytes:
 ; X64-HAVE-BMI2-NO-SHLD:       # %bb.0:
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq (%rdi), %rax
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq 8(%rdi), %rcx
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq 16(%rdi), %r8
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq (%rdi), %rcx
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq 8(%rdi), %r8
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq 16(%rdi), %r9
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq 24(%rdi), %rdi
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    movzbl (%rsi), %esi
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movzbl (%rsi), %eax
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    xorps %xmm0, %xmm0
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq %r9, -{{[0-9]+}}(%rsp)
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq %r8, -{{[0-9]+}}(%rsp)
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq %rcx, -{{[0-9]+}}(%rsp)
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq %rax, -{{[0-9]+}}(%rsp)
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    movl %esi, %eax
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    andb $7, %al
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    shrb $3, %sil
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    movzbl %sil, %ecx
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq -56(%rsp,%rcx), %rsi
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq -48(%rsp,%rcx), %rdi
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    shrxq %rax, %rsi, %r8
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    shrxq %rax, -64(%rsp,%rcx), %r9
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, %ecx
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    shrb $6, %cl
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movzbl %cl, %esi
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq -64(%rsp,%rsi,8), %rcx
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq -56(%rsp,%rsi,8), %rdi
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    shrxq %rax, %rcx, %r8
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    shrxq %rax, -72(%rsp,%rsi,8), %r9
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    shrxq %rax, %rdi, %r10
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq -40(%rsp,%rcx), %rcx
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    shrxq %rax, %rcx, %r11
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq -48(%rsp,%rsi,8), %rsi
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    shrxq %rax, %rsi, %r11
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    # kill: def $al killed $al killed $rax def $rax
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    notb %al
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    andb $63, %al
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    xorb $63, %al
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    addq %rdi, %rdi
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    shlxq %rax, %rdi, %rdi
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    orq %r8, %rdi
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    addq %rsi, %rsi
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    shlxq %rax, %rsi, %rsi
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    orq %r9, %rsi
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    addq %rcx, %rcx
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    shlxq %rax, %rcx, %rax
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    shlxq %rax, %rcx, %rcx
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    orq %r9, %rcx
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    addq %rsi, %rsi
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    shlxq %rax, %rsi, %rax
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    orq %r10, %rax
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq %r11, 24(%rdx)
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq %rax, 16(%rdx)
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq %rsi, (%rdx)
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq %rcx, (%rdx)
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq %rdi, 8(%rdx)
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    retq
 ;
 ; X64-HAVE-BMI2-HAVE-SHLD-LABEL: lshr_32bytes:
 ; X64-HAVE-BMI2-HAVE-SHLD:       # %bb.0:
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    pushq %rbx
 ; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq (%rdi), %rax
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq 8(%rdi), %rcx
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq 16(%rdi), %r8
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq 8(%rdi), %r8
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq 16(%rdi), %r9
 ; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq 24(%rdi), %rdi
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movzbl (%rsi), %esi
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movzbl (%rsi), %ecx
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    xorps %xmm0, %xmm0
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
 ; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq %r9, -{{[0-9]+}}(%rsp)
 ; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq %r8, -{{[0-9]+}}(%rsp)
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq %rcx, -{{[0-9]+}}(%rsp)
 ; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq %rax, -{{[0-9]+}}(%rsp)
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %esi, %ecx
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    andb $7, %cl
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    shrb $3, %sil
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movzbl %sil, %eax
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq -56(%rsp,%rax), %rsi
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    shrxq %rcx, %rsi, %rdi
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq -40(%rsp,%rax), %r8
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    shrxq %rcx, %r8, %r9
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ecx, %r10d
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    notb %r10b
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq -48(%rsp,%rax), %r11
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    leaq (%r11,%r11), %rbx
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    shlxq %r10, %rbx, %r10
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq -64(%rsp,%rax), %rax
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    orq %rdi, %r10
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    shrdq %cl, %r8, %r11
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    # kill: def $cl killed $cl killed $rcx
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    shrdq %cl, %rsi, %rax
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq %r11, 16(%rdx)
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq %r9, 24(%rdx)
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq %rax, (%rdx)
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq %r10, 8(%rdx)
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    popq %rbx
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ecx, %eax
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    shrb $6, %al
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movzbl %al, %eax
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq -56(%rsp,%rax,8), %rsi
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq -72(%rsp,%rax,8), %rdi
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq -64(%rsp,%rax,8), %r8
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq %r8, %r9
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    shrdq %cl, %rsi, %r9
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq -48(%rsp,%rax,8), %rax
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    shrdq %cl, %rax, %rsi
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    shrdq %cl, %r8, %rdi
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    shrxq %rcx, %rax, %rax
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq %rsi, 16(%rdx)
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq %rax, 24(%rdx)
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq %rdi, (%rdx)
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq %r9, 8(%rdx)
 ; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    retq
 ;
 ; X86-NO-BMI2-NO-SHLD-LABEL: lshr_32bytes:
@@ -1640,127 +1552,120 @@ define void @lshr_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; X86-NO-BMI2-NO-SHLD-NEXT:    pushl %ebx
 ; X86-NO-BMI2-NO-SHLD-NEXT:    pushl %edi
 ; X86-NO-BMI2-NO-SHLD-NEXT:    pushl %esi
-; X86-NO-BMI2-NO-SHLD-NEXT:    subl $88, %esp
+; X86-NO-BMI2-NO-SHLD-NEXT:    subl $108, %esp
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %edi
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl (%edi), %eax
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, (%esp) # 4-byte Spill
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl 4(%edi), %eax
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ebp
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl (%ebp), %eax
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl 8(%edi), %esi
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl 12(%edi), %ebx
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl 16(%edi), %ebp
-; X86-NO-BMI2-NO-SHLD-NEXT:    movb (%ecx), %ch
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl 20(%edi), %edx
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl 24(%edi), %eax
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl 28(%edi), %edi
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %edi, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl 4(%ebp), %eax
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl 8(%ebp), %esi
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl 12(%ebp), %edi
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl 16(%ebp), %ebx
+; X86-NO-BMI2-NO-SHLD-NEXT:    movzbl (%ecx), %edx
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl 20(%ebp), %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl 24(%ebp), %eax
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl 28(%ebp), %ebp
+; X86-NO-BMI2-NO-SHLD-NEXT:    xorps %xmm0, %xmm0
+; X86-NO-BMI2-NO-SHLD-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ebp, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-NO-SHLD-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ebx, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %edi, {{[0-9]+}}(%esp)
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl %esi, {{[0-9]+}}(%esp)
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl (%esp), %eax # 4-byte Reload
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-NO-SHLD-NEXT:    movb %ch, %al
-; X86-NO-BMI2-NO-SHLD-NEXT:    andb $7, %al
-; X86-NO-BMI2-NO-SHLD-NEXT:    shrb $3, %ch
-; X86-NO-BMI2-NO-SHLD-NEXT:    movzbl %ch, %edi
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl 24(%esp,%edi), %esi
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl 28(%esp,%edi), %ebx
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ebx, (%esp) # 4-byte Spill
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %ecx
-; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %ebx
-; X86-NO-BMI2-NO-SHLD-NEXT:    movb %al, %ah
-; X86-NO-BMI2-NO-SHLD-NEXT:    notb %ah
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl 32(%esp,%edi), %edi
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %edx, %eax
+; X86-NO-BMI2-NO-SHLD-NEXT:    shrb $5, %al
+; X86-NO-BMI2-NO-SHLD-NEXT:    movzbl %al, %eax
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl 32(%esp,%eax,4), %esi
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl 36(%esp,%eax,4), %ebx
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %edi
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ebx, %eax
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %edx, %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %eax
+; X86-NO-BMI2-NO-SHLD-NEXT:    movb %dl, %ch
+; X86-NO-BMI2-NO-SHLD-NEXT:    andb $31, %ch
+; X86-NO-BMI2-NO-SHLD-NEXT:    xorb $31, %ch
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl 40(%esp,%edi,4), %edi
 ; X86-NO-BMI2-NO-SHLD-NEXT:    leal (%edi,%edi), %ebp
-; X86-NO-BMI2-NO-SHLD-NEXT:    movb %ah, %cl
+; X86-NO-BMI2-NO-SHLD-NEXT:    movb %ch, %cl
 ; X86-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %ebp
-; X86-NO-BMI2-NO-SHLD-NEXT:    orl %ebx, %ebp
+; X86-NO-BMI2-NO-SHLD-NEXT:    orl %eax, %ebp
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT:    movb %dl, %cl
 ; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %esi
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl (%esp), %ebx # 4-byte Reload
 ; X86-NO-BMI2-NO-SHLD-NEXT:    addl %ebx, %ebx
-; X86-NO-BMI2-NO-SHLD-NEXT:    movb %ah, %cl
-; X86-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %ebx
-; X86-NO-BMI2-NO-SHLD-NEXT:    orl %esi, %ebx
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ebx, (%esp) # 4-byte Spill
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl 36(%esp,%ebx), %ebp
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ebp, %esi
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %ecx
-; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %esi
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl 40(%esp,%ebx), %ecx
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ebx, %edx
-; X86-NO-BMI2-NO-SHLD-NEXT:    leal (%ecx,%ecx), %ebx
-; X86-NO-BMI2-NO-SHLD-NEXT:    movb %ah, %cl
+; X86-NO-BMI2-NO-SHLD-NEXT:    movb %ch, %cl
 ; X86-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %ebx
 ; X86-NO-BMI2-NO-SHLD-NEXT:    orl %esi, %ebx
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl 44(%esp,%esi,4), %ebp
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ebp, %eax
+; X86-NO-BMI2-NO-SHLD-NEXT:    movb %dl, %cl
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %edx, %ebx
+; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %eax
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl 48(%esp,%esi,4), %edx
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-BMI2-NO-SHLD-NEXT:    leal (%edx,%edx), %esi
+; X86-NO-BMI2-NO-SHLD-NEXT:    movb %ch, %cl
+; X86-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %esi
+; X86-NO-BMI2-NO-SHLD-NEXT:    orl %eax, %esi
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ebx, %edx
+; X86-NO-BMI2-NO-SHLD-NEXT:    movb %bl, %cl
 ; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %edi
 ; X86-NO-BMI2-NO-SHLD-NEXT:    addl %ebp, %ebp
-; X86-NO-BMI2-NO-SHLD-NEXT:    movb %ah, %cl
+; X86-NO-BMI2-NO-SHLD-NEXT:    movb %ch, %cl
 ; X86-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %ebp
 ; X86-NO-BMI2-NO-SHLD-NEXT:    orl %edi, %ebp
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl 44(%esp,%edx), %edi
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %edi, %ebx
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %ecx
-; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %ebx
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl 48(%esp,%edx), %esi
-; X86-NO-BMI2-NO-SHLD-NEXT:    leal (%esi,%esi), %edx
-; X86-NO-BMI2-NO-SHLD-NEXT:    movb %ah, %cl
-; X86-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %edx
-; X86-NO-BMI2-NO-SHLD-NEXT:    orl %ebx, %edx
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-NO-SHLD-NEXT:    movb %al, %ch
-; X86-NO-BMI2-NO-SHLD-NEXT:    movb %al, %cl
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %ebx
-; X86-NO-BMI2-NO-SHLD-NEXT:    addl %edi, %edi
-; X86-NO-BMI2-NO-SHLD-NEXT:    movb %ah, %cl
-; X86-NO-BMI2-NO-SHLD-NEXT:    movb %ah, %dl
-; X86-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %edi
-; X86-NO-BMI2-NO-SHLD-NEXT:    orl %ebx, %edi
-; X86-NO-BMI2-NO-SHLD-NEXT:    movb %al, %cl
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %esi, %eax
-; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %eax
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl 52(%esp,%esi), %ebx
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl 52(%esp,%esi,4), %edi
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %edi, %eax
+; X86-NO-BMI2-NO-SHLD-NEXT:    movb %bl, %cl
+; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %eax
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl 56(%esp,%esi,4), %ebx
 ; X86-NO-BMI2-NO-SHLD-NEXT:    leal (%ebx,%ebx), %esi
-; X86-NO-BMI2-NO-SHLD-NEXT:    movb %dl, %cl
+; X86-NO-BMI2-NO-SHLD-NEXT:    movb %ch, %cl
 ; X86-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %esi
 ; X86-NO-BMI2-NO-SHLD-NEXT:    orl %eax, %esi
+; X86-NO-BMI2-NO-SHLD-NEXT:    movb %dl, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill
+; X86-NO-BMI2-NO-SHLD-NEXT:    movb %dl, %cl
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %eax
+; X86-NO-BMI2-NO-SHLD-NEXT:    addl %edi, %edi
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movb %ch, %cl
+; X86-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %edi
+; X86-NO-BMI2-NO-SHLD-NEXT:    orl %eax, %edi
+; X86-NO-BMI2-NO-SHLD-NEXT:    movb %dl, %cl
 ; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %ebx
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ebx, 28(%eax)
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %esi, 24(%eax)
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %edi, 16(%eax)
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ecx, 20(%eax)
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ebp, 8(%eax)
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ecx, 12(%eax)
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl (%esp), %ecx # 4-byte Reload
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ecx, (%eax)
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ecx, 4(%eax)
-; X86-NO-BMI2-NO-SHLD-NEXT:    addl $88, %esp
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl 60(%esp,%eax,4), %eax
+; X86-NO-BMI2-NO-SHLD-NEXT:    leal (%eax,%eax), %edx
+; X86-NO-BMI2-NO-SHLD-NEXT:    movb %ch, %cl
+; X86-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %edx
+; X86-NO-BMI2-NO-SHLD-NEXT:    orl %ebx, %edx
+; X86-NO-BMI2-NO-SHLD-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 1-byte Folded Reload
+; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %eax
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, 28(%ecx)
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %edx, 24(%ecx)
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %edi, 16(%ecx)
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %esi, 20(%ecx)
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ebp, 8(%ecx)
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, 12(%ecx)
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, (%ecx)
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, 4(%ecx)
+; X86-NO-BMI2-NO-SHLD-NEXT:    addl $108, %esp
 ; X86-NO-BMI2-NO-SHLD-NEXT:    popl %esi
 ; X86-NO-BMI2-NO-SHLD-NEXT:    popl %edi
 ; X86-NO-BMI2-NO-SHLD-NEXT:    popl %ebx
@@ -1775,95 +1680,67 @@ define void @lshr_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; X86-NO-BMI2-HAVE-SHLD-NEXT:    pushl %esi
 ; X86-NO-BMI2-HAVE-SHLD-NEXT:    subl $92, %esp
 ; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %edi
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl (%edi), %eax
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ebp
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl (%ebp), %eax
 ; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 4(%edi), %eax
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 4(%ebp), %eax
 ; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, (%esp) # 4-byte Spill
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 8(%edi), %esi
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 12(%edi), %ebx
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 16(%edi), %ebp
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 8(%ebp), %esi
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 12(%ebp), %edi
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 16(%ebp), %ebx
 ; X86-NO-BMI2-HAVE-SHLD-NEXT:    movzbl (%ecx), %ecx
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 20(%edi), %edx
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 24(%edi), %eax
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 28(%edi), %edi
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %edi, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 20(%ebp), %edx
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 24(%ebp), %eax
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 28(%ebp), %ebp
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    xorps %xmm0, %xmm0
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %ebp, {{[0-9]+}}(%esp)
 ; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
 ; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %edx, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %ebp, {{[0-9]+}}(%esp)
 ; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %ebx, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %edi, {{[0-9]+}}(%esp)
 ; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %esi, {{[0-9]+}}(%esp)
 ; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl (%esp), %eax # 4-byte Reload
 ; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[0-9]+}}(%esp)
 ; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
 ; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl $0, {{[0-9]+}}(%esp)
 ; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %ecx, %eax
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    andb $7, %al
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    shrb $3, %cl
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movzbl %cl, %ebp
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 32(%esp,%ebp), %esi
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, %ecx
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    shrl %cl, %esi
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, %edx
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    notb %dl
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 36(%esp,%ebp), %ecx
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    leal (%ecx,%ecx), %edi
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %edx, %ecx
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    shll %cl, %edi
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    orl %esi, %edi
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 40(%esp,%ebp), %esi
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    shrb $5, %al
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movzbl %al, %ebp
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 24(%esp,%ebp,4), %edx
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 20(%esp,%ebp,4), %eax
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, (%esp) # 4-byte Spill
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %edx, %eax
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 32(%esp,%ebp,4), %ebx
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 28(%esp,%ebp,4), %eax
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, %esi
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %ebx, %esi
 ; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, %ecx
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    shrl %cl, %esi
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 44(%esp,%ebp), %ecx
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %ecx, (%esp) # 4-byte Spill
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    leal (%ecx,%ecx), %edi
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %edx, %ecx
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    shll %cl, %edi
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    orl %esi, %edi
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 48(%esp,%ebp), %edi
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, %ecx
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    shrl %cl, %edi
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 52(%esp,%ebp), %ebx
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    leal (%ebx,%ebx), %esi
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %edx, %ecx
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    shll %cl, %esi
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    orl %edi, %esi
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, %ecx
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %edx, (%esp) # 4-byte Folded Spill
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 28(%esp,%ebp), %edx
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 56(%esp,%ebp), %edi
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %edi, %ebx
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %eax, %edx
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 40(%esp,%ebp,4), %edx
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 36(%esp,%ebp,4), %eax
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, %edi
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %edx, %edi
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %eax, %ebx
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 16(%esp,%ebp,4), %esi
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 44(%esp,%ebp,4), %eax
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %eax, %edx
 ; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ebp
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %ebx, 24(%ebp)
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %ebx, %edx
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    shrl %cl, %edi
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %edi, 28(%ebp)
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl (%esp), %eax # 4-byte Reload
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, 16(%ebp)
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %edx, 24(%ebp)
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl (%esp), %edx # 4-byte Reload
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %edx, %esi
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    shrl %cl, %eax
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, 28(%ebp)
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %ebx, 16(%ebp)
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %edi, 20(%ebp)
 ; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
 ; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, 8(%ebp)
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %edx, (%ebp)
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %esi, 20(%ebp)
 ; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
 ; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, 12(%ebp)
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %esi, (%ebp)
 ; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
 ; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, 4(%ebp)
 ; X86-NO-BMI2-HAVE-SHLD-NEXT:    addl $92, %esp
@@ -1879,103 +1756,95 @@ define void @lshr_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    pushl %ebx
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    pushl %edi
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    pushl %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    subl $84, %esp
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %edi
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl (%edi), %eax
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 4(%edi), %eax
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 8(%edi), %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 12(%edi), %ebx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 16(%edi), %ebp
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movzbl (%ecx), %edx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 20(%edi), %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 24(%edi), %eax
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 28(%edi), %edi
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %edi, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    subl $108, %esp
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl (%eax), %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 4(%eax), %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 8(%eax), %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 12(%eax), %edi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 16(%eax), %ebx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 20(%eax), %ebp
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 24(%eax), %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 28(%eax), %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movzbl (%eax), %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    xorps %xmm0, %xmm0
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ebp, {{[0-9]+}}(%esp)
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ebx, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %edi, {{[0-9]+}}(%esp)
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %esi, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %edx, %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    andb $7, %cl
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrb $3, %dl
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movzbl %dl, %edi
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 24(%esp,%edi), %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 28(%esp,%edi), %eax
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %ecx, %esi, %ebx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, %edx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    notb %dl
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    leal (%eax,%eax), %ebp
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %edx, %ebp, %eax
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl %ebx, %eax
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, %eax
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %ecx, 20(%esp,%edi), %ebx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    addl %esi, %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %edx, %esi, %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl %ebx, %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrb $5, %cl
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movzbl %cl, %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 36(%esp,%esi,4), %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 40(%esp,%esi,4), %ebx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %eax, %ecx, %edi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    andb $31, %dl
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    xorb $31, %dl
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    addl %ebx, %ebx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %edx, %ebx, %ebx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl %edi, %ebx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %eax, 32(%esp,%esi,4), %edi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    addl %ecx, %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %edx, %ecx, %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl %edi, %ecx
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 36(%esp,%edi), %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, (%esp) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    leal (%ecx,%ecx), %ebx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %edx, %ebx, %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 32(%esp,%edi), %ebp
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %eax, %ebp, %ebx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl %ebx, %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 48(%esp,%esi,4), %ecx
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %eax, {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, %ebx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    addl %ebp, %ebp
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %edx, %ebp, %eax
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl %ecx, %eax
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 44(%esp,%edi), %ebp
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    leal (%ebp,%ebp), %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    addl %ecx, %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %edx, %ecx, %edi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 44(%esp,%esi,4), %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %eax, %ecx, %ebp
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl %ebp, %edi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %eax, {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    addl %ecx, %ecx
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %edx, %ecx, %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 40(%esp,%edi), %eax
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %ebx, %eax, %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl %esi, %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %ebx, (%esp), %esi # 4-byte Folded Reload
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %esi, (%esp) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    addl %eax, %eax
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %edx, %eax, %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl (%esp), %esi # 4-byte Folded Reload
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %ebx, %ebp, %eax
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 48(%esp,%edi), %edi
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %ebx, %edi, %ebx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl %edi, %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 56(%esp,%esi,4), %ebx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    leal (%ebx,%ebx), %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %edx, %ecx, %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 52(%esp,%esi,4), %edi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %eax, %edi, %ebp
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl %ebp, %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %eax, {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    addl %edi, %edi
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %edx, %edi, %edi
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl %eax, %edi
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ebx, 28(%eax)
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %edi, 24(%eax)
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %esi, 16(%eax)
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, 20(%eax)
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, 8(%eax)
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, 12(%eax)
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, (%eax)
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, 4(%eax)
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    addl $84, %esp
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl %ebp, %edi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %eax, %ebx, %ebx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 60(%esp,%esi,4), %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %eax, %esi, %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    addl %esi, %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %edx, %esi, %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl %ebx, %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, 28(%esi)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %edx, 24(%esi)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %edi, 16(%esi)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, 20(%esi)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, 8(%esi)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, 12(%esi)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, (%esi)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, 4(%esi)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    addl $108, %esp
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    popl %esi
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    popl %edi
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    popl %ebx
@@ -1988,92 +1857,73 @@ define void @lshr_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    pushl %ebx
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    pushl %edi
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    pushl %esi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    subl $88, %esp
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %edi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl (%edi), %ecx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 4(%edi), %ecx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ecx, (%esp) # 4-byte Spill
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 8(%edi), %esi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 12(%edi), %ebx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 16(%edi), %ebp
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movzbl (%eax), %eax
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 20(%edi), %edx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 24(%edi), %ecx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 28(%edi), %edi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edi, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    subl $92, %esp
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl (%ecx), %eax
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 4(%ecx), %eax
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, (%esp) # 4-byte Spill
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 8(%ecx), %esi
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 12(%ecx), %edi
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 16(%ecx), %ebx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 20(%ecx), %ebp
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 24(%ecx), %edx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 28(%ecx), %eax
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movzbl (%ecx), %ecx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    xorps %xmm0, %xmm0
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ebp, {{[0-9]+}}(%esp)
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ebx, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edi, {{[0-9]+}}(%esp)
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %esi, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl (%esp), %ecx # 4-byte Reload
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, %ecx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    andb $7, %cl
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    shrb $3, %al
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movzbl %al, %ebx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 28(%esp,%ebx), %eax
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl (%esp), %eax # 4-byte Reload
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ecx, %eax
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    shrb $5, %al
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movzbl %al, %ebp
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 24(%esp,%ebp,4), %esi
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 20(%esp,%ebp,4), %eax
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    shrxl %ecx, %eax, %eax
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ecx, %edx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    notb %dl
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 32(%esp,%ebx), %esi
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %esi, %eax
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, (%esp) # 4-byte Spill
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 32(%esp,%ebp,4), %ebx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 28(%esp,%ebp,4), %eax
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, %edx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %ebx, %edx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %eax, %esi
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    addl %esi, %esi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    shlxl %edx, %esi, %esi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    orl %eax, %esi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %esi, (%esp) # 4-byte Spill
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 40(%esp,%ebx), %ebp
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    leal (%ebp,%ebp), %eax
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    shlxl %edx, %eax, %edi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 36(%esp,%ebx), %eax
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    shrxl %ecx, %eax, %eax
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    orl %eax, %edi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 48(%esp,%ebx), %esi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    leal (%esi,%esi), %edi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    shlxl %edx, %edi, %eax
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 44(%esp,%ebx), %edx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    shrxl %ecx, %edx, %edi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    orl %edi, %eax
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %eax, %edi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %edx, %ebp
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 24(%esp,%ebx), %edx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 52(%esp,%ebx), %eax
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 40(%esp,%ebp,4), %eax
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 36(%esp,%ebp,4), %edx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edx, %esi
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %eax, %esi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ebx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %esi, 24(%ebx)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    shrxl %ecx, %eax, %eax
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, 28(%ebx)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ebp, 16(%ebx)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edi, 8(%ebx)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    # kill: def $cl killed $cl killed $ecx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %edx, %ebx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 16(%esp,%ebp,4), %edx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 44(%esp,%ebp,4), %edi
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %edi, %eax
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ebp
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, 24(%ebp)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    shrxl %ecx, %edi, %eax
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, 28(%ebp)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ebx, 16(%ebp)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %esi, 20(%ebp)
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %eax, %edx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edx, (%ebx)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, 8(%ebp)
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, 20(%ebx)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, 12(%ebp)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    # kill: def $cl killed $cl killed $ecx
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, 12(%ebx)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %eax, %edx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edx, (%ebp)
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl (%esp), %eax # 4-byte Reload
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, 4(%ebx)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    addl $88, %esp
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, 4(%ebp)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    addl $92, %esp
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    popl %esi
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    popl %edi
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    popl %ebx
@@ -2089,31 +1939,31 @@ define void @shl_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; X64-NO-BMI2-NO-SHLD-LABEL: shl_32bytes:
 ; X64-NO-BMI2-NO-SHLD:       # %bb.0:
 ; X64-NO-BMI2-NO-SHLD-NEXT:    pushq %rbx
-; X64-NO-BMI2-NO-SHLD-NEXT:    movq (%rdi), %rax
-; X64-NO-BMI2-NO-SHLD-NEXT:    movq 8(%rdi), %rcx
-; X64-NO-BMI2-NO-SHLD-NEXT:    movq 16(%rdi), %r8
+; X64-NO-BMI2-NO-SHLD-NEXT:    movq (%rdi), %rcx
+; X64-NO-BMI2-NO-SHLD-NEXT:    movq 8(%rdi), %r8
+; X64-NO-BMI2-NO-SHLD-NEXT:    movq 16(%rdi), %r9
 ; X64-NO-BMI2-NO-SHLD-NEXT:    movq 24(%rdi), %rdi
-; X64-NO-BMI2-NO-SHLD-NEXT:    movzbl (%rsi), %esi
+; X64-NO-BMI2-NO-SHLD-NEXT:    movzbl (%rsi), %eax
+; X64-NO-BMI2-NO-SHLD-NEXT:    xorps %xmm0, %xmm0
+; X64-NO-BMI2-NO-SHLD-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-NO-SHLD-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
 ; X64-NO-BMI2-NO-SHLD-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-NO-SHLD-NEXT:    movq %r9, -{{[0-9]+}}(%rsp)
 ; X64-NO-BMI2-NO-SHLD-NEXT:    movq %r8, -{{[0-9]+}}(%rsp)
 ; X64-NO-BMI2-NO-SHLD-NEXT:    movq %rcx, -{{[0-9]+}}(%rsp)
-; X64-NO-BMI2-NO-SHLD-NEXT:    movq %rax, -{{[0-9]+}}(%rsp)
-; X64-NO-BMI2-NO-SHLD-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
-; X64-NO-BMI2-NO-SHLD-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
-; X64-NO-BMI2-NO-SHLD-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
-; X64-NO-BMI2-NO-SHLD-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
-; X64-NO-BMI2-NO-SHLD-NEXT:    movl %esi, %eax
-; X64-NO-BMI2-NO-SHLD-NEXT:    andb $7, %al
-; X64-NO-BMI2-NO-SHLD-NEXT:    shrb $3, %sil
-; X64-NO-BMI2-NO-SHLD-NEXT:    negb %sil
-; X64-NO-BMI2-NO-SHLD-NEXT:    movsbq %sil, %r10
+; X64-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %ecx
+; X64-NO-BMI2-NO-SHLD-NEXT:    shrb $3, %cl
+; X64-NO-BMI2-NO-SHLD-NEXT:    andb $24, %cl
+; X64-NO-BMI2-NO-SHLD-NEXT:    negb %cl
+; X64-NO-BMI2-NO-SHLD-NEXT:    movsbq %cl, %r10
 ; X64-NO-BMI2-NO-SHLD-NEXT:    movq -32(%rsp,%r10), %r8
 ; X64-NO-BMI2-NO-SHLD-NEXT:    movq -24(%rsp,%r10), %rdi
 ; X64-NO-BMI2-NO-SHLD-NEXT:    movq %rdi, %r11
 ; X64-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %ecx
 ; X64-NO-BMI2-NO-SHLD-NEXT:    shlq %cl, %r11
 ; X64-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %esi
-; X64-NO-BMI2-NO-SHLD-NEXT:    notb %sil
+; X64-NO-BMI2-NO-SHLD-NEXT:    andb $63, %sil
+; X64-NO-BMI2-NO-SHLD-NEXT:    xorb $63, %sil
 ; X64-NO-BMI2-NO-SHLD-NEXT:    movq %r8, %r9
 ; X64-NO-BMI2-NO-SHLD-NEXT:    shrq %r9
 ; X64-NO-BMI2-NO-SHLD-NEXT:    movl %esi, %ecx
@@ -2146,79 +1996,70 @@ define void @shl_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; X64-NO-BMI2-HAVE-SHLD-LABEL: shl_32bytes:
 ; X64-NO-BMI2-HAVE-SHLD:       # %bb.0:
 ; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq (%rdi), %rax
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq 8(%rdi), %rcx
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq 16(%rdi), %r8
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq 8(%rdi), %r8
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq 16(%rdi), %r9
 ; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq 24(%rdi), %rdi
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    movzbl (%rsi), %esi
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movzbl (%rsi), %ecx
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    xorps %xmm0, %xmm0
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
 ; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %r9, -{{[0-9]+}}(%rsp)
 ; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %r8, -{{[0-9]+}}(%rsp)
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %rcx, -{{[0-9]+}}(%rsp)
 ; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %rax, -{{[0-9]+}}(%rsp)
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    movl %esi, %eax
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    andb $7, %al
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    shrb $3, %sil
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    negb %sil
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    movsbq %sil, %rsi
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq -16(%rsp,%rsi), %rdi
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %rdi, %r8
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, %ecx
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movl %ecx, %eax
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    shrb $3, %al
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    andb $24, %al
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    negb %al
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movsbq %al, %rax
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq -24(%rsp,%rax), %rsi
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq -16(%rsp,%rax), %rdi
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    shldq %cl, %rsi, %rdi
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq -40(%rsp,%rax), %r8
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq -32(%rsp,%rax), %rax
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    shldq %cl, %rax, %rsi
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    shldq %cl, %r8, %rax
 ; X64-NO-BMI2-HAVE-SHLD-NEXT:    shlq %cl, %r8
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    notb %cl
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq -32(%rsp,%rsi), %r9
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq -24(%rsp,%rsi), %r10
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %r10, %r11
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    shrq %r11
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    shrq %cl, %r11
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    orq %r8, %r11
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq -8(%rsp,%rsi), %rsi
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, %ecx
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    shldq %cl, %rdi, %rsi
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    shldq %cl, %r9, %r10
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    shlq %cl, %r9
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %rsi, 24(%rdx)
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %r9, (%rdx)
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %r10, 8(%rdx)
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %r11, 16(%rdx)
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %rsi, 16(%rdx)
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %rdi, 24(%rdx)
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %r8, (%rdx)
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %rax, 8(%rdx)
 ; X64-NO-BMI2-HAVE-SHLD-NEXT:    retq
 ;
 ; X64-HAVE-BMI2-NO-SHLD-LABEL: shl_32bytes:
 ; X64-HAVE-BMI2-NO-SHLD:       # %bb.0:
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq (%rdi), %rax
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq 8(%rdi), %rcx
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq 16(%rdi), %r8
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq (%rdi), %rcx
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq 8(%rdi), %r8
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq 16(%rdi), %r9
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq 24(%rdi), %rdi
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    movzbl (%rsi), %esi
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movzbl (%rsi), %eax
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    xorps %xmm0, %xmm0
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq %r9, -{{[0-9]+}}(%rsp)
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq %r8, -{{[0-9]+}}(%rsp)
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq %rcx, -{{[0-9]+}}(%rsp)
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq %rax, -{{[0-9]+}}(%rsp)
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    movl %esi, %eax
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    andb $7, %al
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    shrb $3, %sil
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    negb %sil
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    movsbq %sil, %rsi
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq -32(%rsp,%rsi), %rdi
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq -24(%rsp,%rsi), %rcx
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    shlxq %rax, %rcx, %r8
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    shlxq %rax, -8(%rsp,%rsi), %r9
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq -16(%rsp,%rsi), %rsi
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    shlxq %rax, %rsi, %r10
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    shlxq %rax, %rdi, %r11
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, %ecx
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    shrb $3, %cl
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    andb $24, %cl
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    negb %cl
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movsbq %cl, %rdi
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq -40(%rsp,%rdi), %r8
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq -32(%rsp,%rdi), %rcx
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    shlxq %rax, %rcx, %rsi
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    shlxq %rax, -16(%rsp,%rdi), %r9
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq -24(%rsp,%rdi), %rdi
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    shlxq %rax, %rdi, %r10
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    shlxq %rax, %r8, %r11
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    # kill: def $al killed $al killed $rax def $rax
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    notb %al
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    andb $63, %al
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    xorb $63, %al
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    shrq %r8
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    shrxq %rax, %r8, %r8
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    orq %rsi, %r8
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    shrq %rdi
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    shrxq %rax, %rdi, %rdi
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    orq %r8, %rdi
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    shrq %rsi
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    shrxq %rax, %rsi, %rsi
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    shrxq %rax, %rdi, %rsi
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    orq %r9, %rsi
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    shrq %rcx
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    shrxq %rax, %rcx, %rax
@@ -2226,50 +2067,40 @@ define void @shl_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq %r11, (%rdx)
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq %rax, 16(%rdx)
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq %rsi, 24(%rdx)
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq %rdi, 8(%rdx)
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq %r8, 8(%rdx)
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    retq
 ;
 ; X64-HAVE-BMI2-HAVE-SHLD-LABEL: shl_32bytes:
 ; X64-HAVE-BMI2-HAVE-SHLD:       # %bb.0:
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    pushq %rbx
 ; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq (%rdi), %rax
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq 8(%rdi), %rcx
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq 16(%rdi), %r8
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq 8(%rdi), %r8
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq 16(%rdi), %r9
 ; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq 24(%rdi), %rdi
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movzbl (%rsi), %esi
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movzbl (%rsi), %ecx
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    xorps %xmm0, %xmm0
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
 ; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq %r9, -{{[0-9]+}}(%rsp)
 ; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq %r8, -{{[0-9]+}}(%rsp)
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq %rcx, -{{[0-9]+}}(%rsp)
 ; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq %rax, -{{[0-9]+}}(%rsp)
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %esi, %ecx
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    andb $7, %cl
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    shrb $3, %sil
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    negb %sil
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movsbq %sil, %rax
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq -16(%rsp,%rax), %rsi
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    shlxq %rcx, %rsi, %rdi
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq -32(%rsp,%rax), %r8
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    shlxq %rcx, %r8, %r9
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ecx, %r10d
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    notb %r10b
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq -24(%rsp,%rax), %r11
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq %r11, %rbx
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    shrq %rbx
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    shrxq %r10, %rbx, %r10
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    orq %rdi, %r10
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq -8(%rsp,%rax), %rax
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    shldq %cl, %rsi, %rax
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    # kill: def $cl killed $cl killed $rcx
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    shldq %cl, %r8, %r11
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq %rax, 24(%rdx)
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq %r9, (%rdx)
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq %r11, 8(%rdx)
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq %r10, 16(%rdx)
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    popq %rbx
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ecx, %eax
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    shrb $3, %al
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    andb $24, %al
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    negb %al
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movsbq %al, %rax
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq -24(%rsp,%rax), %rsi
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq -16(%rsp,%rax), %rdi
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    shldq %cl, %rsi, %rdi
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq -40(%rsp,%rax), %r8
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq -32(%rsp,%rax), %rax
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    shldq %cl, %rax, %rsi
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    shldq %cl, %r8, %rax
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    shlxq %rcx, %r8, %rcx
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq %rsi, 16(%rdx)
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq %rdi, 24(%rdx)
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq %rcx, (%rdx)
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq %rax, 8(%rdx)
 ; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    retq
 ;
 ; X86-NO-BMI2-NO-SHLD-LABEL: shl_32bytes:
@@ -2278,118 +2109,112 @@ define void @shl_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; X86-NO-BMI2-NO-SHLD-NEXT:    pushl %ebx
 ; X86-NO-BMI2-NO-SHLD-NEXT:    pushl %edi
 ; X86-NO-BMI2-NO-SHLD-NEXT:    pushl %esi
-; X86-NO-BMI2-NO-SHLD-NEXT:    subl $88, %esp
+; X86-NO-BMI2-NO-SHLD-NEXT:    subl $108, %esp
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %edi
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl (%edi), %eax
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, (%esp) # 4-byte Spill
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl 4(%edi), %eax
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ebp
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl (%ebp), %eax
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl 8(%edi), %esi
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl 12(%edi), %ebx
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl 16(%edi), %ebp
-; X86-NO-BMI2-NO-SHLD-NEXT:    movzbl (%ecx), %ecx
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl 20(%edi), %edx
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl 24(%edi), %eax
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl 28(%edi), %edi
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %edi, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl 4(%ebp), %eax
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl 8(%ebp), %esi
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl 12(%ebp), %edi
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl 16(%ebp), %ebx
+; X86-NO-BMI2-NO-SHLD-NEXT:    movb (%ecx), %ch
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl 20(%ebp), %edx
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl 24(%ebp), %eax
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl 28(%ebp), %ebp
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ebp, {{[0-9]+}}(%esp)
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-NO-SHLD-NEXT:    xorps %xmm0, %xmm0
+; X86-NO-BMI2-NO-SHLD-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-NO-SHLD-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl %edx, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ebp, {{[0-9]+}}(%esp)
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ebx, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %edi, {{[0-9]+}}(%esp)
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl %esi, {{[0-9]+}}(%esp)
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl (%esp), %eax # 4-byte Reload
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ecx, %eax
-; X86-NO-BMI2-NO-SHLD-NEXT:    andb $7, %al
-; X86-NO-BMI2-NO-SHLD-NEXT:    shrb $3, %cl
-; X86-NO-BMI2-NO-SHLD-NEXT:    negb %cl
-; X86-NO-BMI2-NO-SHLD-NEXT:    movsbl %cl, %ecx
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl 56(%esp,%ecx), %edi
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl 60(%esp,%ecx), %edx
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ecx, %ebp
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %edx, %esi
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT:    movb %ch, %al
+; X86-NO-BMI2-NO-SHLD-NEXT:    shrb $3, %al
+; X86-NO-BMI2-NO-SHLD-NEXT:    andb $28, %al
+; X86-NO-BMI2-NO-SHLD-NEXT:    negb %al
+; X86-NO-BMI2-NO-SHLD-NEXT:    movsbl %al, %ebx
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl 64(%esp,%ebx), %esi
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl 68(%esp,%ebx), %edi
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %edi, %eax
+; X86-NO-BMI2-NO-SHLD-NEXT:    movb %ch, %cl
+; X86-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %eax
+; X86-NO-BMI2-NO-SHLD-NEXT:    movb %ch, %dl
+; X86-NO-BMI2-NO-SHLD-NEXT:    andb $31, %dl
+; X86-NO-BMI2-NO-SHLD-NEXT:    xorb $31, %dl
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %esi, %ebp
+; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %ebp
+; X86-NO-BMI2-NO-SHLD-NEXT:    movb %dl, %cl
+; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %ebp
+; X86-NO-BMI2-NO-SHLD-NEXT:    orl %eax, %ebp
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl 76(%esp,%ebx), %esi
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-BMI2-NO-SHLD-NEXT:    movb %ch, %cl
 ; X86-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %esi
-; X86-NO-BMI2-NO-SHLD-NEXT:    movb %al, %ah
-; X86-NO-BMI2-NO-SHLD-NEXT:    notb %ah
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl 72(%esp,%ebx), %eax
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %ebp
+; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %ebp
+; X86-NO-BMI2-NO-SHLD-NEXT:    movb %dl, %cl
+; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %ebp
+; X86-NO-BMI2-NO-SHLD-NEXT:    orl %esi, %ebp
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-BMI2-NO-SHLD-NEXT:    movb %ch, %cl
+; X86-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %eax
 ; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %edi
-; X86-NO-BMI2-NO-SHLD-NEXT:    movb %ah, %cl
+; X86-NO-BMI2-NO-SHLD-NEXT:    movb %dl, %cl
 ; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %edi
-; X86-NO-BMI2-NO-SHLD-NEXT:    orl %esi, %edi
+; X86-NO-BMI2-NO-SHLD-NEXT:    orl %eax, %edi
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl 68(%esp,%ebp), %edi
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %edi, (%esp) # 4-byte Spill
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl 84(%esp,%ebx), %esi
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %esi, %edi
+; X86-NO-BMI2-NO-SHLD-NEXT:    movb %ch, %cl
 ; X86-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %edi
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl 64(%esp,%ebp), %esi
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %esi, %ebx
-; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %ebx
-; X86-NO-BMI2-NO-SHLD-NEXT:    movb %ah, %cl
-; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %ebx
-; X86-NO-BMI2-NO-SHLD-NEXT:    orl %edi, %ebx
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %ecx
-; X86-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %esi
-; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %edx
-; X86-NO-BMI2-NO-SHLD-NEXT:    movb %ah, %cl
-; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %edx
-; X86-NO-BMI2-NO-SHLD-NEXT:    orl %esi, %edx
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl 76(%esp,%ebp), %esi
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %esi, %ebx
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %ecx
-; X86-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %ebx
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl 72(%esp,%ebp), %edi
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %edi, %ebp
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl 80(%esp,%ebx), %eax
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %ebp
 ; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %ebp
-; X86-NO-BMI2-NO-SHLD-NEXT:    movb %ah, %cl
+; X86-NO-BMI2-NO-SHLD-NEXT:    movb %dl, %cl
 ; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %ebp
-; X86-NO-BMI2-NO-SHLD-NEXT:    orl %ebx, %ebp
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %ecx
-; X86-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %edi
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl (%esp), %edx # 4-byte Reload
-; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %edx
-; X86-NO-BMI2-NO-SHLD-NEXT:    movb %ah, %cl
-; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %edx
-; X86-NO-BMI2-NO-SHLD-NEXT:    orl %edi, %edx
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %edx, (%esp) # 4-byte Spill
+; X86-NO-BMI2-NO-SHLD-NEXT:    orl %edi, %ebp
+; X86-NO-BMI2-NO-SHLD-NEXT:    movb %ch, %cl
+; X86-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %eax
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl 84(%esp,%edi), %edx
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %ecx
-; X86-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %edx
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl 80(%esp,%edi), %edi
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %edi, %ebx
+; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %edi
+; X86-NO-BMI2-NO-SHLD-NEXT:    movb %dl, %cl
+; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %edi
+; X86-NO-BMI2-NO-SHLD-NEXT:    orl %eax, %edi
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl 92(%esp,%ebx), %edi
+; X86-NO-BMI2-NO-SHLD-NEXT:    movb %ch, %cl
+; X86-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %edi
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl 88(%esp,%ebx), %eax
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %ebx
 ; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %ebx
-; X86-NO-BMI2-NO-SHLD-NEXT:    movb %ah, %cl
+; X86-NO-BMI2-NO-SHLD-NEXT:    movb %dl, %cl
 ; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %ebx
-; X86-NO-BMI2-NO-SHLD-NEXT:    orl %edx, %ebx
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %ecx
-; X86-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %edi
+; X86-NO-BMI2-NO-SHLD-NEXT:    orl %edi, %ebx
+; X86-NO-BMI2-NO-SHLD-NEXT:    movb %ch, %cl
+; X86-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %eax
 ; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %esi
-; X86-NO-BMI2-NO-SHLD-NEXT:    movb %ah, %cl
+; X86-NO-BMI2-NO-SHLD-NEXT:    movb %dl, %cl
 ; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %esi
-; X86-NO-BMI2-NO-SHLD-NEXT:    orl %edi, %esi
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT:    orl %eax, %esi
+; X86-NO-BMI2-NO-SHLD-NEXT:    movb %ch, %cl
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
 ; X86-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %edx
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl %edx, (%eax)
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl %esi, 24(%eax)
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ebx, 28(%eax)
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl (%esp), %ecx # 4-byte Reload
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ecx, 16(%eax)
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ebp, 20(%eax)
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
@@ -2398,7 +2223,7 @@ define void @shl_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ecx, 12(%eax)
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ecx, 4(%eax)
-; X86-NO-BMI2-NO-SHLD-NEXT:    addl $88, %esp
+; X86-NO-BMI2-NO-SHLD-NEXT:    addl $108, %esp
 ; X86-NO-BMI2-NO-SHLD-NEXT:    popl %esi
 ; X86-NO-BMI2-NO-SHLD-NEXT:    popl %edi
 ; X86-NO-BMI2-NO-SHLD-NEXT:    popl %ebx
@@ -2413,99 +2238,70 @@ define void @shl_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; X86-NO-BMI2-HAVE-SHLD-NEXT:    pushl %esi
 ; X86-NO-BMI2-HAVE-SHLD-NEXT:    subl $92, %esp
 ; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %edi
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl (%edi), %eax
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ebp
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl (%ebp), %eax
 ; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 4(%edi), %eax
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 4(%ebp), %eax
 ; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, (%esp) # 4-byte Spill
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 8(%edi), %esi
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 12(%edi), %ebx
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 16(%edi), %ebp
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 8(%ebp), %esi
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 12(%ebp), %edi
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 16(%ebp), %ebx
 ; X86-NO-BMI2-HAVE-SHLD-NEXT:    movzbl (%ecx), %ecx
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 20(%edi), %edx
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 24(%edi), %eax
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 28(%edi), %edi
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %edi, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 20(%ebp), %edx
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 24(%ebp), %eax
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 28(%ebp), %ebp
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %ebp, {{[0-9]+}}(%esp)
 ; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    xorps %xmm0, %xmm0
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
 ; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %edx, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %ebp, {{[0-9]+}}(%esp)
 ; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %ebx, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %edi, {{[0-9]+}}(%esp)
 ; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %esi, {{[0-9]+}}(%esp)
 ; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl (%esp), %eax # 4-byte Reload
 ; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[0-9]+}}(%esp)
 ; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
 ; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl $0, {{[0-9]+}}(%esp)
 ; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %ecx, %eax
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    andb $7, %al
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    shrb $3, %cl
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    negb %cl
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movsbl %cl, %ebx
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 68(%esp,%ebx), %esi
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, %ecx
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    shll %cl, %esi
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, %edx
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    notb %dl
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 64(%esp,%ebx), %edi
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    shrl %edi
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %edx, %ecx
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    shrl %cl, %edi
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    orl %esi, %edi
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 76(%esp,%ebx), %esi
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    shrb $3, %al
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    andb $28, %al
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    negb %al
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movsbl %al, %eax
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 56(%esp,%eax), %edx
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 60(%esp,%eax), %ebx
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %ebx, %esi
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    shldl %cl, %edx, %esi
 ; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, %ecx
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    shll %cl, %esi
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 72(%esp,%ebx), %ebp
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 52(%esp,%eax), %esi
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %esi, (%esp) # 4-byte Spill
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    shldl %cl, %esi, %edx
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 64(%esp,%eax), %edi
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 68(%esp,%eax), %ebp
 ; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    shrl %ebp
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %edx, %ecx
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    shrl %cl, %ebp
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    orl %esi, %ebp
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 84(%esp,%ebx), %esi
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, %ecx
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    shll %cl, %esi
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 80(%esp,%ebx), %edi
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %edi, (%esp) # 4-byte Spill
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    shrl %edi
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %edx, %ecx
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    shrl %cl, %edi
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    orl %esi, %edi
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, %ecx
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    shldl %cl, %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    shldl %cl, %edx, (%esp) # 4-byte Folded Spill
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 60(%esp,%ebx), %edx
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 88(%esp,%ebx), %esi
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    shldl %cl, %ebx, %esi
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ebx
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %esi, 28(%ebx)
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl (%esp), %ecx # 4-byte Reload
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %ecx, 20(%ebx)
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %ecx, 12(%ebx)
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, %ecx
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    shldl %cl, %edi, %ebp
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    shldl %cl, %ebx, %edi
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 48(%esp,%eax), %ebx
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 72(%esp,%eax), %edx
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 76(%esp,%eax), %esi
 ; X86-NO-BMI2-HAVE-SHLD-NEXT:    shldl %cl, %edx, %esi
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    shll %cl, %edx
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %edx, (%ebx)
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %esi, 4(%ebx)
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %edi, 24(%ebx)
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %ebp, 16(%ebx)
 ; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, 8(%ebx)
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    shldl %cl, %eax, %edx
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %edx, 24(%eax)
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %esi, 28(%eax)
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %edi, 16(%eax)
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %ebp, 20(%eax)
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %edx, 8(%eax)
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %edx, 12(%eax)
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl (%esp), %edx # 4-byte Reload
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    shldl %cl, %ebx, %edx
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    shll %cl, %ebx
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %ebx, (%eax)
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %edx, 4(%eax)
 ; X86-NO-BMI2-HAVE-SHLD-NEXT:    addl $92, %esp
 ; X86-NO-BMI2-HAVE-SHLD-NEXT:    popl %esi
 ; X86-NO-BMI2-HAVE-SHLD-NEXT:    popl %edi
@@ -2519,106 +2315,105 @@ define void @shl_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    pushl %ebx
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    pushl %edi
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    pushl %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    subl $88, %esp
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %edi
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl (%edi), %eax
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 4(%edi), %eax
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 8(%edi), %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 12(%edi), %ebx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 16(%edi), %ebp
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movzbl (%ecx), %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 20(%edi), %edx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 24(%edi), %eax
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 28(%edi), %edi
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %edi, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    subl $108, %esp
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl (%eax), %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 4(%eax), %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 8(%eax), %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 12(%eax), %edi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 16(%eax), %ebx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 20(%eax), %ebp
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 24(%eax), %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 28(%eax), %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movzbl (%eax), %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    xorps %xmm0, %xmm0
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ebp, {{[0-9]+}}(%esp)
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ebx, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %edi, {{[0-9]+}}(%esp)
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %esi, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, %edx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    andb $7, %dl
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl $0, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, %ecx
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrb $3, %cl
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    andb $28, %cl
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    negb %cl
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movsbl %cl, %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 56(%esp,%esi), %ebx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movsbl %cl, %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 64(%esp,%edx), %ebx
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 60(%esp,%esi), %eax
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, (%esp) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %edx, %eax, %edi
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %edx, %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    notb %cl
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 68(%esp,%edx), %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %edx, %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %eax, %ecx, %edi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, %ebp
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    andb $31, %dl
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    xorb $31, %dl
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrl %ebx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %ecx, %ebx, %ebx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %edx, %ebx, %ebx
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl %edi, %ebx
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 64(%esp,%esi), %ebx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 72(%esp,%esi), %ebx
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ebx, %edi
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrl %edi
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %ecx, %edi, %eax
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 68(%esp,%esi), %edi
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %edx, %edi, %ebp
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %edx, %edi, %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 76(%esp,%esi), %edi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ebp, %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %ebp, %edi, %ebp
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl %ebp, %eax
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %edx, %ebx, %ebx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl (%esp), %eax # 4-byte Reload
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrl %eax
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %ecx, %eax, %eax
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl %ebx, %eax
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, (%esp) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 72(%esp,%esi), %ebx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %esi, %ebx, %ebx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrl %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %edx, %ecx, %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl %ebx, %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 80(%esp,%ebp), %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, %ebx
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrl %ebx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %ecx, %ebx, %eax
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 76(%esp,%esi), %ebx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %edx, %ebx, %ebp
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %edx, %ebx, %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 84(%esp,%ebp), %ebx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %esi, %ebx, %ebp
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl %ebp, %eax
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %esi, %ecx, %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %esi, %eax
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrl %edi
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %ecx, %edi, %edi
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl %eax, %edi
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %edx, 84(%esp,%esi), %ebp
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 80(%esp,%esi), %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %edx, %esi, %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %edx, %edi, %edi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl %ecx, %edi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %esi, {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %eax, 92(%esp,%esi), %ebp
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 88(%esp,%esi), %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %eax, %esi, %eax
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrl %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %ecx, %esi, %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %edx, %esi, %esi
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl %ebp, %esi
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrl %ebx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %ecx, %ebx, %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %edx, %ebx, %edx
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl %eax, %edx
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, (%eax)
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %edx, 24(%eax)
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %esi, 28(%eax)
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %edi, 16(%eax)
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, 20(%eax)
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl (%esp), %ecx # 4-byte Reload
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, 8(%eax)
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, 12(%eax)
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, 4(%eax)
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    addl $88, %esp
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    addl $108, %esp
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    popl %esi
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    popl %edi
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    popl %ebx
@@ -2631,95 +2426,75 @@ define void @shl_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    pushl %ebx
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    pushl %edi
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    pushl %esi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    subl $88, %esp
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %edi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl (%edi), %ecx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 4(%edi), %ecx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ecx, (%esp) # 4-byte Spill
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 8(%edi), %esi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 12(%edi), %ebx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 16(%edi), %ebp
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movzbl (%eax), %eax
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 20(%edi), %edx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 24(%edi), %ecx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 28(%edi), %edi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edi, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    subl $92, %esp
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl (%ecx), %eax
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 4(%ecx), %eax
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 8(%ecx), %esi
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 12(%ecx), %edi
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 16(%ecx), %ebx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 20(%ecx), %ebp
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 24(%ecx), %edx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 28(%ecx), %eax
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movzbl (%ecx), %ecx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[0-9]+}}(%esp)
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    xorps %xmm0, %xmm0
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ebp, {{[0-9]+}}(%esp)
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ebx, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edi, {{[0-9]+}}(%esp)
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %esi, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl (%esp), %ecx # 4-byte Reload
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, %ecx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    andb $7, %cl
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl $0, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ecx, %eax
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    shrb $3, %al
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    andb $28, %al
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    negb %al
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movsbl %al, %esi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 64(%esp,%esi), %eax
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    shlxl %ecx, %eax, %edx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ecx, %eax
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    notb %al
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 60(%esp,%esi), %ebx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ebx, (%esp) # 4-byte Spill
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    shrl %ebx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    shrxl %eax, %ebx, %edi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    orl %edx, %edi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 68(%esp,%esi), %ebx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ebx, %edx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    shrl %edx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    shrxl %eax, %edx, %edi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 72(%esp,%esi), %edx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movsbl %al, %eax
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 56(%esp,%eax), %edx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 60(%esp,%eax), %esi
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %esi, (%esp) # 4-byte Spill
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    shldl %cl, %edx, %esi
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 52(%esp,%eax), %ebx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    shldl %cl, %ebx, %edx
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    shlxl %ecx, %edx, %edx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    orl %edx, %edi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 76(%esp,%esi), %edi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edi, %edx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    shrl %edx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    shrxl %eax, %edx, %eax
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 80(%esp,%esi), %edx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    shlxl %ecx, %edx, %ebp
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    orl %ebp, %eax
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    shldl %cl, %eax, %ebx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 64(%esp,%eax), %edi
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 68(%esp,%eax), %ebp
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    shldl %cl, %edi, %ebp
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl (%esp), %edx # 4-byte Reload
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    shldl %cl, %edx, %edi
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 48(%esp,%eax), %edx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edx, (%esp) # 4-byte Spill
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 72(%esp,%eax), %edx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 76(%esp,%eax), %esi
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    shldl %cl, %edx, %esi
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    shldl %cl, %eax, %edi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 56(%esp,%esi), %ebp
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 84(%esp,%esi), %eax
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    shldl %cl, %edx, %eax
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, 28(%esi)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edi, 20(%esi)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ebx, 12(%esi)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    shlxl %ecx, %ebp, %eax
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, (%esi)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    shldl %cl, %eax, %edx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edx, 24(%eax)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %esi, 28(%eax)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edi, 16(%eax)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ebp, 20(%eax)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edx, 8(%eax)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edx, 12(%eax)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl (%esp), %esi # 4-byte Reload
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    shlxl %ecx, %esi, %edx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edx, (%eax)
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    # kill: def $cl killed $cl killed $ecx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl (%esp), %eax # 4-byte Reload
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    shldl %cl, %ebp, %eax
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, 4(%esi)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, 24(%esi)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, 16(%esi)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, 8(%esi)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    addl $88, %esp
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    shldl %cl, %esi, %ebx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ebx, 4(%eax)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    addl $92, %esp
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    popl %esi
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    popl %edi
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    popl %ebx
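
For reference, the shl_32bytes hunks above all share one shape: zero-fill the
low half of a scratch buffer (the xorps/movaps stores), copy the 32-byte
operand above it, reduce the bit count to a negated word offset ("shrb $3;
andb $28; negb" on the non-BMI2 path, "shrb $5" elsewhere), and read the
result back with scaled addressing. Below is a minimal C++ sketch of that
computation; the buffer layout and the 32-bit shift unit are assumptions for
illustration, not code taken from the patch.

#include <cstdint>
#include <cstring>

// Illustrative sketch only, assuming a 32-bit shift unit as in the X86 hunks.
void shl_32bytes_ref(const uint8_t *src, unsigned bits, uint8_t *dst) {
  uint32_t buf[16] = {0};              // low half pre-zeroed (xorps/movaps)
  std::memcpy(buf + 8, src, 32);       // operand stored above the zeros
  unsigned words = (bits / 32) % 8;    // word part: "shrb $3; andb $28"
  unsigned rem = bits % 32;            // in-word part, 0..31
  const uint32_t *p = buf + 8 - words; // "negb" + scaled indexing
  uint32_t out[8];
  for (int i = 0; i < 8; ++i) {
    // Carry from the word below: shift right by 1, then by (31 - rem),
    // mirroring "shrl %reg" followed by "xorb $31, %cl; shll/shrl %cl";
    // the two-step form stays well-defined when rem == 0.
    uint32_t carry = (p[i - 1] >> 1) >> (31 - rem);
    out[i] = (p[i] << rem) | carry;
  }
  std::memcpy(dst, out, 32);
}

When rem is 0 the carry term is zero and out[i] is a plain word copy, which
is the behavior the word-aligned indexing above makes cheap.
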
@@ -2735,36 +2510,36 @@ define void @ashr_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; X64-NO-BMI2-NO-SHLD-LABEL: ashr_32bytes:
 ; X64-NO-BMI2-NO-SHLD:       # %bb.0:
 ; X64-NO-BMI2-NO-SHLD-NEXT:    pushq %rbx
-; X64-NO-BMI2-NO-SHLD-NEXT:    movq (%rdi), %rax
-; X64-NO-BMI2-NO-SHLD-NEXT:    movq 8(%rdi), %rcx
-; X64-NO-BMI2-NO-SHLD-NEXT:    movq 16(%rdi), %r8
+; X64-NO-BMI2-NO-SHLD-NEXT:    movq (%rdi), %rcx
+; X64-NO-BMI2-NO-SHLD-NEXT:    movq 8(%rdi), %r8
+; X64-NO-BMI2-NO-SHLD-NEXT:    movq 16(%rdi), %r9
 ; X64-NO-BMI2-NO-SHLD-NEXT:    movq 24(%rdi), %rdi
-; X64-NO-BMI2-NO-SHLD-NEXT:    movzbl (%rsi), %esi
+; X64-NO-BMI2-NO-SHLD-NEXT:    movzbl (%rsi), %eax
 ; X64-NO-BMI2-NO-SHLD-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-NO-SHLD-NEXT:    movq %r9, -{{[0-9]+}}(%rsp)
 ; X64-NO-BMI2-NO-SHLD-NEXT:    movq %r8, -{{[0-9]+}}(%rsp)
 ; X64-NO-BMI2-NO-SHLD-NEXT:    movq %rcx, -{{[0-9]+}}(%rsp)
-; X64-NO-BMI2-NO-SHLD-NEXT:    movq %rax, -{{[0-9]+}}(%rsp)
 ; X64-NO-BMI2-NO-SHLD-NEXT:    sarq $63, %rdi
 ; X64-NO-BMI2-NO-SHLD-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
 ; X64-NO-BMI2-NO-SHLD-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
 ; X64-NO-BMI2-NO-SHLD-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
 ; X64-NO-BMI2-NO-SHLD-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
-; X64-NO-BMI2-NO-SHLD-NEXT:    movl %esi, %eax
-; X64-NO-BMI2-NO-SHLD-NEXT:    andb $7, %al
-; X64-NO-BMI2-NO-SHLD-NEXT:    shrb $3, %sil
-; X64-NO-BMI2-NO-SHLD-NEXT:    movzbl %sil, %r9d
-; X64-NO-BMI2-NO-SHLD-NEXT:    movq -64(%rsp,%r9), %r10
-; X64-NO-BMI2-NO-SHLD-NEXT:    movq -56(%rsp,%r9), %rdi
+; X64-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %ecx
+; X64-NO-BMI2-NO-SHLD-NEXT:    shrb $6, %cl
+; X64-NO-BMI2-NO-SHLD-NEXT:    movzbl %cl, %r8d
+; X64-NO-BMI2-NO-SHLD-NEXT:    movq -64(%rsp,%r8,8), %r10
+; X64-NO-BMI2-NO-SHLD-NEXT:    movq -56(%rsp,%r8,8), %rdi
 ; X64-NO-BMI2-NO-SHLD-NEXT:    movq %rdi, %r11
 ; X64-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %ecx
 ; X64-NO-BMI2-NO-SHLD-NEXT:    shrq %cl, %r11
 ; X64-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %esi
-; X64-NO-BMI2-NO-SHLD-NEXT:    notb %sil
-; X64-NO-BMI2-NO-SHLD-NEXT:    movq -48(%rsp,%r9), %rbx
-; X64-NO-BMI2-NO-SHLD-NEXT:    leaq (%rbx,%rbx), %r8
+; X64-NO-BMI2-NO-SHLD-NEXT:    andb $63, %sil
+; X64-NO-BMI2-NO-SHLD-NEXT:    xorb $63, %sil
+; X64-NO-BMI2-NO-SHLD-NEXT:    movq -48(%rsp,%r8,8), %rbx
+; X64-NO-BMI2-NO-SHLD-NEXT:    leaq (%rbx,%rbx), %r9
 ; X64-NO-BMI2-NO-SHLD-NEXT:    movl %esi, %ecx
-; X64-NO-BMI2-NO-SHLD-NEXT:    shlq %cl, %r8
-; X64-NO-BMI2-NO-SHLD-NEXT:    orq %r11, %r8
+; X64-NO-BMI2-NO-SHLD-NEXT:    shlq %cl, %r9
+; X64-NO-BMI2-NO-SHLD-NEXT:    orq %r11, %r9
 ; X64-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %ecx
 ; X64-NO-BMI2-NO-SHLD-NEXT:    shrq %cl, %r10
 ; X64-NO-BMI2-NO-SHLD-NEXT:    addq %rdi, %rdi
@@ -2773,145 +2548,130 @@ define void @ashr_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; X64-NO-BMI2-NO-SHLD-NEXT:    orq %r10, %rdi
 ; X64-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %ecx
 ; X64-NO-BMI2-NO-SHLD-NEXT:    shrq %cl, %rbx
-; X64-NO-BMI2-NO-SHLD-NEXT:    movq -40(%rsp,%r9), %r9
-; X64-NO-BMI2-NO-SHLD-NEXT:    leaq (%r9,%r9), %r10
+; X64-NO-BMI2-NO-SHLD-NEXT:    movq -40(%rsp,%r8,8), %r8
+; X64-NO-BMI2-NO-SHLD-NEXT:    leaq (%r8,%r8), %r10
 ; X64-NO-BMI2-NO-SHLD-NEXT:    movl %esi, %ecx
 ; X64-NO-BMI2-NO-SHLD-NEXT:    shlq %cl, %r10
 ; X64-NO-BMI2-NO-SHLD-NEXT:    orq %rbx, %r10
 ; X64-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %ecx
-; X64-NO-BMI2-NO-SHLD-NEXT:    sarq %cl, %r9
-; X64-NO-BMI2-NO-SHLD-NEXT:    movq %r9, 24(%rdx)
+; X64-NO-BMI2-NO-SHLD-NEXT:    sarq %cl, %r8
+; X64-NO-BMI2-NO-SHLD-NEXT:    movq %r8, 24(%rdx)
 ; X64-NO-BMI2-NO-SHLD-NEXT:    movq %r10, 16(%rdx)
 ; X64-NO-BMI2-NO-SHLD-NEXT:    movq %rdi, (%rdx)
-; X64-NO-BMI2-NO-SHLD-NEXT:    movq %r8, 8(%rdx)
+; X64-NO-BMI2-NO-SHLD-NEXT:    movq %r9, 8(%rdx)
 ; X64-NO-BMI2-NO-SHLD-NEXT:    popq %rbx
 ; X64-NO-BMI2-NO-SHLD-NEXT:    retq
 ;
 ; X64-NO-BMI2-HAVE-SHLD-LABEL: ashr_32bytes:
 ; X64-NO-BMI2-HAVE-SHLD:       # %bb.0:
 ; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq (%rdi), %rax
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq 8(%rdi), %rcx
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq 16(%rdi), %r8
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq 8(%rdi), %r8
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq 16(%rdi), %r9
 ; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq 24(%rdi), %rdi
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    movzbl (%rsi), %esi
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movzbl (%rsi), %ecx
 ; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %r9, -{{[0-9]+}}(%rsp)
 ; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %r8, -{{[0-9]+}}(%rsp)
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %rcx, -{{[0-9]+}}(%rsp)
 ; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %rax, -{{[0-9]+}}(%rsp)
 ; X64-NO-BMI2-HAVE-SHLD-NEXT:    sarq $63, %rdi
 ; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
 ; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
 ; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
 ; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    movl %esi, %eax
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    andb $7, %al
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    shrb $3, %sil
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    movzbl %sil, %esi
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq -64(%rsp,%rsi), %rdi
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq -56(%rsp,%rsi), %r8
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movl %ecx, %eax
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    shrb $6, %al
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movzbl %al, %eax
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq -56(%rsp,%rax,8), %rsi
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq -72(%rsp,%rax,8), %rdi
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq -64(%rsp,%rax,8), %r8
 ; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %r8, %r9
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, %ecx
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    shrq %cl, %r9
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    notb %cl
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq -48(%rsp,%rsi), %r10
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    leaq (%r10,%r10), %r11
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    shlq %cl, %r11
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    orq %r9, %r11
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq -40(%rsp,%rsi), %rsi
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, %ecx
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    shrdq %cl, %rsi, %r10
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    shrdq %cl, %rsi, %r9
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq -48(%rsp,%rax,8), %rax
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    shrdq %cl, %rax, %rsi
 ; X64-NO-BMI2-HAVE-SHLD-NEXT:    shrdq %cl, %r8, %rdi
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    sarq %cl, %rsi
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %r10, 16(%rdx)
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %rsi, 24(%rdx)
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    sarq %cl, %rax
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %rsi, 16(%rdx)
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %rax, 24(%rdx)
 ; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %rdi, (%rdx)
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %r11, 8(%rdx)
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %r9, 8(%rdx)
 ; X64-NO-BMI2-HAVE-SHLD-NEXT:    retq
 ;
 ; X64-HAVE-BMI2-NO-SHLD-LABEL: ashr_32bytes:
 ; X64-HAVE-BMI2-NO-SHLD:       # %bb.0:
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq (%rdi), %rax
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq 8(%rdi), %rcx
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq 16(%rdi), %r8
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq (%rdi), %rcx
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq 8(%rdi), %r8
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq 16(%rdi), %r9
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq 24(%rdi), %rdi
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    movzbl (%rsi), %esi
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movzbl (%rsi), %eax
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq %r9, -{{[0-9]+}}(%rsp)
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq %r8, -{{[0-9]+}}(%rsp)
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq %rcx, -{{[0-9]+}}(%rsp)
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq %rax, -{{[0-9]+}}(%rsp)
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    sarq $63, %rdi
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    movl %esi, %eax
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    andb $7, %al
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    shrb $3, %sil
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    movzbl %sil, %ecx
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq -56(%rsp,%rcx), %rsi
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq -48(%rsp,%rcx), %rdi
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    shrxq %rax, %rsi, %r8
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    shrxq %rax, -64(%rsp,%rcx), %r9
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, %ecx
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    shrb $6, %cl
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movzbl %cl, %esi
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq -64(%rsp,%rsi,8), %rcx
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq -56(%rsp,%rsi,8), %rdi
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    shrxq %rax, %rcx, %r8
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    shrxq %rax, -72(%rsp,%rsi,8), %r9
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    shrxq %rax, %rdi, %r10
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq -40(%rsp,%rcx), %rcx
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    sarxq %rax, %rcx, %r11
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq -48(%rsp,%rsi,8), %rsi
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    sarxq %rax, %rsi, %r11
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    # kill: def $al killed $al killed $rax def $rax
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    notb %al
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    andb $63, %al
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    xorb $63, %al
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    addq %rdi, %rdi
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    shlxq %rax, %rdi, %rdi
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    orq %r8, %rdi
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    addq %rsi, %rsi
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    shlxq %rax, %rsi, %rsi
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    orq %r9, %rsi
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    addq %rcx, %rcx
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    shlxq %rax, %rcx, %rax
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    shlxq %rax, %rcx, %rcx
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    orq %r9, %rcx
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    addq %rsi, %rsi
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    shlxq %rax, %rsi, %rax
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    orq %r10, %rax
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq %r11, 24(%rdx)
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq %rax, 16(%rdx)
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq %rsi, (%rdx)
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq %rcx, (%rdx)
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq %rdi, 8(%rdx)
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    retq
 ;
 ; X64-HAVE-BMI2-HAVE-SHLD-LABEL: ashr_32bytes:
 ; X64-HAVE-BMI2-HAVE-SHLD:       # %bb.0:
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    pushq %rbx
 ; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq (%rdi), %rax
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq 8(%rdi), %rcx
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq 16(%rdi), %r8
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq 8(%rdi), %r8
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq 16(%rdi), %r9
 ; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq 24(%rdi), %rdi
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movzbl (%rsi), %esi
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movzbl (%rsi), %ecx
 ; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq %r9, -{{[0-9]+}}(%rsp)
 ; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq %r8, -{{[0-9]+}}(%rsp)
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq %rcx, -{{[0-9]+}}(%rsp)
 ; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq %rax, -{{[0-9]+}}(%rsp)
 ; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    sarq $63, %rdi
 ; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
 ; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
 ; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
 ; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %esi, %ecx
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    andb $7, %cl
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    shrb $3, %sil
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movzbl %sil, %eax
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq -56(%rsp,%rax), %rsi
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    shrxq %rcx, %rsi, %rdi
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq -40(%rsp,%rax), %r8
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    sarxq %rcx, %r8, %r9
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ecx, %r10d
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    notb %r10b
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq -48(%rsp,%rax), %r11
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    leaq (%r11,%r11), %rbx
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    shlxq %r10, %rbx, %r10
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq -64(%rsp,%rax), %rax
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    orq %rdi, %r10
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    shrdq %cl, %r8, %r11
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    # kill: def $cl killed $cl killed $rcx
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    shrdq %cl, %rsi, %rax
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq %r11, 16(%rdx)
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq %r9, 24(%rdx)
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq %rax, (%rdx)
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq %r10, 8(%rdx)
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    popq %rbx
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ecx, %eax
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    shrb $6, %al
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movzbl %al, %eax
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq -56(%rsp,%rax,8), %rsi
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq -72(%rsp,%rax,8), %rdi
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq -64(%rsp,%rax,8), %r8
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq %r8, %r9
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    shrdq %cl, %rsi, %r9
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq -48(%rsp,%rax,8), %rax
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    shrdq %cl, %rax, %rsi
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    shrdq %cl, %r8, %rdi
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    sarxq %rcx, %rax, %rax
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq %rsi, 16(%rdx)
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq %rax, 24(%rdx)
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq %rdi, (%rdx)
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq %r9, 8(%rdx)
 ; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    retq
 ;
 ; X86-NO-BMI2-NO-SHLD-LABEL: ashr_32bytes:
@@ -2920,17 +2680,17 @@ define void @ashr_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; X86-NO-BMI2-NO-SHLD-NEXT:    pushl %ebx
 ; X86-NO-BMI2-NO-SHLD-NEXT:    pushl %edi
 ; X86-NO-BMI2-NO-SHLD-NEXT:    pushl %esi
-; X86-NO-BMI2-NO-SHLD-NEXT:    subl $88, %esp
+; X86-NO-BMI2-NO-SHLD-NEXT:    subl $108, %esp
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %edx
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl (%edx), %eax
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, (%esp) # 4-byte Spill
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl 4(%edx), %eax
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl 8(%edx), %edi
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl 12(%edx), %ebx
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl 16(%edx), %ebp
-; X86-NO-BMI2-NO-SHLD-NEXT:    movb (%ecx), %ch
+; X86-NO-BMI2-NO-SHLD-NEXT:    movzbl (%ecx), %ecx
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl 20(%edx), %esi
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl 24(%edx), %eax
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl 28(%edx), %edx
@@ -2942,7 +2702,7 @@ define void @ashr_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl %edi, {{[0-9]+}}(%esp)
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl (%esp), %eax # 4-byte Reload
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, {{[0-9]+}}(%esp)
 ; X86-NO-BMI2-NO-SHLD-NEXT:    sarl $31, %edx
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl %edx, {{[0-9]+}}(%esp)
@@ -2953,95 +2713,94 @@ define void @ashr_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl %edx, {{[0-9]+}}(%esp)
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl %edx, {{[0-9]+}}(%esp)
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl %edx, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-NO-SHLD-NEXT:    movb %ch, %al
-; X86-NO-BMI2-NO-SHLD-NEXT:    andb $7, %al
-; X86-NO-BMI2-NO-SHLD-NEXT:    shrb $3, %ch
-; X86-NO-BMI2-NO-SHLD-NEXT:    movzbl %ch, %edi
+; X86-NO-BMI2-NO-SHLD-NEXT:    movb %cl, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ecx, %eax
+; X86-NO-BMI2-NO-SHLD-NEXT:    shrb $5, %al
+; X86-NO-BMI2-NO-SHLD-NEXT:    movzbl %al, %eax
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl 32(%esp,%eax,4), %esi
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl 36(%esp,%eax,4), %edx
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %ebp
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %edx, %eax
+; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %eax
+; X86-NO-BMI2-NO-SHLD-NEXT:    movb %cl, %ch
+; X86-NO-BMI2-NO-SHLD-NEXT:    andb $31, %ch
+; X86-NO-BMI2-NO-SHLD-NEXT:    xorb $31, %ch
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl 40(%esp,%ebp,4), %edi
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl 24(%esp,%edi), %esi
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl 28(%esp,%edi), %ebx
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ebx, (%esp) # 4-byte Spill
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %ecx
-; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %ebx
-; X86-NO-BMI2-NO-SHLD-NEXT:    movb %al, %ah
-; X86-NO-BMI2-NO-SHLD-NEXT:    notb %ah
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl 32(%esp,%edi), %edi
-; X86-NO-BMI2-NO-SHLD-NEXT:    leal (%edi,%edi), %ebp
-; X86-NO-BMI2-NO-SHLD-NEXT:    movb %ah, %cl
-; X86-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %ebp
-; X86-NO-BMI2-NO-SHLD-NEXT:    orl %ebx, %ebp
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %ecx
-; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %esi
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl (%esp), %ebx # 4-byte Reload
-; X86-NO-BMI2-NO-SHLD-NEXT:    addl %ebx, %ebx
-; X86-NO-BMI2-NO-SHLD-NEXT:    movb %ah, %cl
+; X86-NO-BMI2-NO-SHLD-NEXT:    leal (%edi,%edi), %ebx
+; X86-NO-BMI2-NO-SHLD-NEXT:    movb %ch, %cl
 ; X86-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %ebx
-; X86-NO-BMI2-NO-SHLD-NEXT:    orl %esi, %ebx
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ebx, (%esp) # 4-byte Spill
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl 36(%esp,%ebx), %ebp
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ebp, %esi
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT:    orl %eax, %ebx
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-BMI2-NO-SHLD-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 1-byte Folded Reload
+; X86-NO-BMI2-NO-SHLD-NEXT:    movb %bl, %cl
 ; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %esi
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl 40(%esp,%ebx), %ecx
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-BMI2-NO-SHLD-NEXT:    addl %edx, %edx
+; X86-NO-BMI2-NO-SHLD-NEXT:    movb %ch, %cl
+; X86-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %edx
+; X86-NO-BMI2-NO-SHLD-NEXT:    orl %esi, %edx
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ebp, %esi
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl 44(%esp,%ebp,4), %ebp
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ebp, %eax
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ebx, %edx
-; X86-NO-BMI2-NO-SHLD-NEXT:    leal (%ecx,%ecx), %ebx
-; X86-NO-BMI2-NO-SHLD-NEXT:    movb %ah, %cl
-; X86-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %ebx
-; X86-NO-BMI2-NO-SHLD-NEXT:    orl %esi, %ebx
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %ecx
-; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %edi
+; X86-NO-BMI2-NO-SHLD-NEXT:    movb %bl, %cl
+; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %eax
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl 48(%esp,%esi,4), %edi
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %esi, %ebx
+; X86-NO-BMI2-NO-SHLD-NEXT:    leal (%edi,%edi), %esi
+; X86-NO-BMI2-NO-SHLD-NEXT:    movb %ch, %cl
+; X86-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %esi
+; X86-NO-BMI2-NO-SHLD-NEXT:    orl %eax, %esi
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-BMI2-NO-SHLD-NEXT:    movb %dl, %cl
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %eax
 ; X86-NO-BMI2-NO-SHLD-NEXT:    addl %ebp, %ebp
-; X86-NO-BMI2-NO-SHLD-NEXT:    movb %ah, %cl
+; X86-NO-BMI2-NO-SHLD-NEXT:    movb %ch, %cl
 ; X86-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %ebp
-; X86-NO-BMI2-NO-SHLD-NEXT:    orl %edi, %ebp
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl 44(%esp,%edx), %edi
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %edi, %ebx
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %ecx
-; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %ebx
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl 48(%esp,%edx), %esi
-; X86-NO-BMI2-NO-SHLD-NEXT:    leal (%esi,%esi), %edx
-; X86-NO-BMI2-NO-SHLD-NEXT:    movb %ah, %cl
-; X86-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %edx
-; X86-NO-BMI2-NO-SHLD-NEXT:    orl %ebx, %edx
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-NO-SHLD-NEXT:    movb %al, %ch
-; X86-NO-BMI2-NO-SHLD-NEXT:    movb %al, %cl
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %ebx
-; X86-NO-BMI2-NO-SHLD-NEXT:    addl %edi, %edi
-; X86-NO-BMI2-NO-SHLD-NEXT:    movb %ah, %cl
-; X86-NO-BMI2-NO-SHLD-NEXT:    movb %ah, %dl
-; X86-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %edi
-; X86-NO-BMI2-NO-SHLD-NEXT:    orl %ebx, %edi
-; X86-NO-BMI2-NO-SHLD-NEXT:    movb %al, %cl
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %esi, %eax
+; X86-NO-BMI2-NO-SHLD-NEXT:    orl %eax, %ebp
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl 52(%esp,%ebx,4), %edi
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %edi, %eax
+; X86-NO-BMI2-NO-SHLD-NEXT:    movb %dl, %cl
 ; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %eax
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl 52(%esp,%esi), %ebx
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl 56(%esp,%ebx,4), %ebx
 ; X86-NO-BMI2-NO-SHLD-NEXT:    leal (%ebx,%ebx), %esi
-; X86-NO-BMI2-NO-SHLD-NEXT:    movb %dl, %cl
+; X86-NO-BMI2-NO-SHLD-NEXT:    movb %ch, %cl
 ; X86-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %esi
 ; X86-NO-BMI2-NO-SHLD-NEXT:    orl %eax, %esi
+; X86-NO-BMI2-NO-SHLD-NEXT:    movb %dl, %cl
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %eax
+; X86-NO-BMI2-NO-SHLD-NEXT:    addl %edi, %edi
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movb %ch, %cl
-; X86-NO-BMI2-NO-SHLD-NEXT:    sarl %cl, %ebx
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ebx, 28(%eax)
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %esi, 24(%eax)
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %edi, 16(%eax)
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ecx, 20(%eax)
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ebp, 8(%eax)
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ecx, 12(%eax)
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl (%esp), %ecx # 4-byte Reload
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ecx, (%eax)
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ecx, 4(%eax)
-; X86-NO-BMI2-NO-SHLD-NEXT:    addl $88, %esp
+; X86-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %edi
+; X86-NO-BMI2-NO-SHLD-NEXT:    orl %eax, %edi
+; X86-NO-BMI2-NO-SHLD-NEXT:    movb %dl, %cl
+; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %ebx
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl 60(%esp,%eax,4), %eax
+; X86-NO-BMI2-NO-SHLD-NEXT:    leal (%eax,%eax), %edx
+; X86-NO-BMI2-NO-SHLD-NEXT:    movb %ch, %cl
+; X86-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %edx
+; X86-NO-BMI2-NO-SHLD-NEXT:    orl %ebx, %edx
+; X86-NO-BMI2-NO-SHLD-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 1-byte Folded Reload
+; X86-NO-BMI2-NO-SHLD-NEXT:    sarl %cl, %eax
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, 28(%ecx)
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %edx, 24(%ecx)
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %edi, 16(%ecx)
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %esi, 20(%ecx)
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ebp, 8(%ecx)
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, 12(%ecx)
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, (%ecx)
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, 4(%ecx)
+; X86-NO-BMI2-NO-SHLD-NEXT:    addl $108, %esp
 ; X86-NO-BMI2-NO-SHLD-NEXT:    popl %esi
 ; X86-NO-BMI2-NO-SHLD-NEXT:    popl %edi
 ; X86-NO-BMI2-NO-SHLD-NEXT:    popl %ebx
@@ -3088,64 +2847,41 @@ define void @ashr_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %edx, {{[0-9]+}}(%esp)
 ; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %edx, {{[0-9]+}}(%esp)
 ; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %ecx, %eax
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    andb $7, %al
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    shrb $3, %cl
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movzbl %cl, %ebp
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 32(%esp,%ebp), %esi
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, %ecx
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    shrl %cl, %esi
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, %edx
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    notb %dl
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 36(%esp,%ebp), %ecx
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    leal (%ecx,%ecx), %edi
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %edx, %ecx
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    shll %cl, %edi
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    orl %esi, %edi
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 40(%esp,%ebp), %esi
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    shrb $5, %al
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movzbl %al, %ebp
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 24(%esp,%ebp,4), %edx
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 20(%esp,%ebp,4), %eax
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, (%esp) # 4-byte Spill
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %edx, %eax
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 32(%esp,%ebp,4), %ebx
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 28(%esp,%ebp,4), %eax
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, %esi
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %ebx, %esi
 ; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, %ecx
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    shrl %cl, %esi
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 44(%esp,%ebp), %ecx
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %ecx, (%esp) # 4-byte Spill
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    leal (%ecx,%ecx), %edi
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %edx, %ecx
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    shll %cl, %edi
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    orl %esi, %edi
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 48(%esp,%ebp), %edi
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, %ecx
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    shrl %cl, %edi
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 52(%esp,%ebp), %ebx
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    leal (%ebx,%ebx), %esi
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %edx, %ecx
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    shll %cl, %esi
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    orl %edi, %esi
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, %ecx
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %edx, (%esp) # 4-byte Folded Spill
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 28(%esp,%ebp), %edx
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 56(%esp,%ebp), %edi
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %edi, %ebx
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %eax, %edx
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 40(%esp,%ebp,4), %edx
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 36(%esp,%ebp,4), %eax
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, %edi
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %edx, %edi
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %eax, %ebx
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 16(%esp,%ebp,4), %esi
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 44(%esp,%ebp,4), %eax
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %eax, %edx
 ; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ebp
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %ebx, 24(%ebp)
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %ebx, %edx
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    sarl %cl, %edi
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %edi, 28(%ebp)
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl (%esp), %eax # 4-byte Reload
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, 16(%ebp)
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %edx, 24(%ebp)
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl (%esp), %edx # 4-byte Reload
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %edx, %esi
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    sarl %cl, %eax
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, 28(%ebp)
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %ebx, 16(%ebp)
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %edi, 20(%ebp)
 ; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
 ; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, 8(%ebp)
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %edx, (%ebp)
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %esi, 20(%ebp)
 ; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
 ; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, 12(%ebp)
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %esi, (%ebp)
 ; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
 ; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, 4(%ebp)
 ; X86-NO-BMI2-HAVE-SHLD-NEXT:    addl $92, %esp
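
The regenerated checks above and below all follow the same shape: the wide
value is stored to a stack slot with fill words beside it (zeros for lshr,
via xorps/movaps; copies of the sign word for ashr, via the stores after
sarl $31), the high bits of the shift amount pick a register-sized starting
unit (the shrb $5 plus ,4-scaled addressing on 32-bit x86, or the
shrl $3 / andl $56 byte offsets on x86-64), and only the residual amount
below the register width is shifted across adjacent units. A minimal C
sketch of that lowering for a 256-bit logical right shift with 32-bit
units (names and layout are illustrative, not taken from the patch):

    #include <stdint.h>

    /* Illustrative model of the shift-through-stack lowering: a 256-bit
       logical right shift on a target whose native register is 32 bits.
       src[0]/dst[0] hold the least significant 32-bit unit. */
    void lshr256_via_stack(uint32_t dst[8], const uint32_t src[8],
                           unsigned amt) { /* amt in [0, 255] */
      /* Stack slot: the value followed by zero fill, so loads that run
         past the value read back as zero; an arithmetic shift would
         fill with copies of the sign-extended top word instead. */
      uint32_t slot[16] = {0};
      for (int i = 0; i < 8; ++i)
        slot[i] = src[i];

      unsigned unit = amt >> 5; /* starting unit, cf. shrb $5 above  */
      unsigned bits = amt & 31; /* residual shift, cf. andl $31/$63  */

      for (int i = 0; i < 8; ++i) {
        uint32_t lo = slot[unit + i];
        uint32_t hi = slot[unit + i + 1];
        /* One aligned load per unit plus a funnel shift; when bits
           is 0 the load alone is the result and the shift is skipped. */
        dst[i] = bits ? ((lo >> bits) | (hi << (32 - bits))) : lo;
      }
    }

The 64-byte tests below are the same scheme with more units; on x86-64 the
unit is 64 bits, so the amount splits as amt >> 6 and amt & 63, and the
(amt >> 3) & 56 computed in the checks is that unit index scaled to bytes.
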
@@ -3161,106 +2897,101 @@ define void @ashr_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    pushl %ebx
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    pushl %edi
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    pushl %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    subl $84, %esp
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl (%edx), %eax
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, (%esp) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 4(%edx), %eax
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 8(%edx), %edi
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 12(%edx), %ebx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 16(%edx), %ebp
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movzbl (%ecx), %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 20(%edx), %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 24(%edx), %eax
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 28(%edx), %edx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %esi, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    subl $108, %esp
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl (%eax), %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 4(%eax), %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 8(%eax), %edi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 12(%eax), %ebx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 16(%eax), %ebp
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 20(%eax), %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 24(%eax), %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 28(%eax), %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movzbl (%eax), %eax
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %esi, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ebp, {{[0-9]+}}(%esp)
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ebx, {{[0-9]+}}(%esp)
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %edi, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl (%esp), %eax # 4-byte Reload
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    sarl $31, %edx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %edx, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %edx, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %edx, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %edx, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %edx, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %edx, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, %ebx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    andb $7, %bl
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrb $3, %cl
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movzbl %cl, %edi
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 24(%esp,%edi), %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 28(%esp,%edi), %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %ebx, %esi, %edx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %edx, (%esp) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ebx, %edx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    notb %dl
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    leal (%ecx,%ecx), %ebp
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %edx, %ebp, %eax
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl (%esp), %eax # 4-byte Folded Reload
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, (%esp) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ebx, %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %ebx, 20(%esp,%edi), %ebx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    addl %esi, %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %edx, %esi, %eax
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl %ebx, %eax
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 36(%esp,%edi), %eax
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    leal (%eax,%eax), %ebx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %edx, %ebx, %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 32(%esp,%edi), %ebp
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, %eax
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %ecx, %ebp, %ebx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl %ebx, %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %ecx, {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, %ebx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    addl %ebp, %ebp
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %edx, %ebp, %eax
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl %ecx, %eax
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 44(%esp,%edi), %ebp
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    leal (%ebp,%ebp), %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    sarl $31, %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrb $5, %cl
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movzbl %cl, %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 36(%esp,%esi,4), %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 40(%esp,%esi,4), %ebx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %eax, %ecx, %edi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    andb $31, %dl
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    xorb $31, %dl
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    addl %ebx, %ebx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %edx, %ebx, %ebx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl %edi, %ebx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %eax, 32(%esp,%esi,4), %edi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    addl %ecx, %ecx
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %edx, %ecx, %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 40(%esp,%edi), %eax
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %ebx, %eax, %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl %esi, %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %ebx, {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    addl %eax, %eax
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %edx, %eax, %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %ebx, %ebp, %eax
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 48(%esp,%edi), %edi
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    sarxl %ebx, %edi, %ebx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl %edi, %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 48(%esp,%esi,4), %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    addl %ecx, %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %edx, %ecx, %edi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 44(%esp,%esi,4), %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %eax, %ecx, %ebp
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl %ebp, %edi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %eax, {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    addl %ecx, %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %edx, %ecx, %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl %edi, %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 56(%esp,%esi,4), %ebx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    leal (%ebx,%ebx), %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %edx, %ecx, %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 52(%esp,%esi,4), %edi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %eax, %edi, %ebp
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl %ebp, %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %eax, {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    addl %edi, %edi
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %edx, %edi, %edi
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl %eax, %edi
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ebx, 28(%eax)
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %edi, 24(%eax)
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %esi, 16(%eax)
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, 20(%eax)
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, 8(%eax)
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, 12(%eax)
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, (%eax)
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl (%esp), %ecx # 4-byte Reload
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, 4(%eax)
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    addl $84, %esp
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl %ebp, %edi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %eax, %ebx, %ebx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 60(%esp,%esi,4), %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    sarxl %eax, %esi, %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    addl %esi, %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %edx, %esi, %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl %ebx, %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, 28(%esi)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %edx, 24(%esi)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %edi, 16(%esi)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, 20(%esi)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, 8(%esi)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, 12(%esi)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, (%esi)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, 4(%esi)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    addl $108, %esp
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    popl %esi
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    popl %edi
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    popl %ebx
@@ -3273,93 +3004,79 @@ define void @ashr_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    pushl %ebx
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    pushl %edi
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    pushl %esi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    subl $88, %esp
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    subl $92, %esp
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl (%edx), %ecx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl (%eax), %ecx
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 4(%edx), %ecx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 4(%eax), %ecx
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ecx, (%esp) # 4-byte Spill
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 8(%edx), %edi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 12(%edx), %ebx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 16(%edx), %ebp
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movzbl (%eax), %eax
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 20(%edx), %esi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 24(%edx), %ecx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 28(%edx), %edx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %esi, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 8(%eax), %edi
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 12(%eax), %ebx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 16(%eax), %ebp
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 20(%eax), %esi
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 24(%eax), %edx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 28(%eax), %eax
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movzbl (%ecx), %ecx
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %esi, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[0-9]+}}(%esp)
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ebp, {{[0-9]+}}(%esp)
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ebx, {{[0-9]+}}(%esp)
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edi, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl (%esp), %ecx # 4-byte Reload
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    sarl $31, %edx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edx, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edx, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edx, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edx, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edx, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl (%esp), %edx # 4-byte Reload
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edx, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, %ecx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    andb $7, %cl
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    shrb $3, %al
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movzbl %al, %ebx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 28(%esp,%ebx), %eax
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    sarl $31, %eax
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ecx, %eax
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    shrb $5, %al
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movzbl %al, %ebp
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 24(%esp,%ebp,4), %esi
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 20(%esp,%ebp,4), %eax
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    shrxl %ecx, %eax, %eax
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ecx, %edx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    notb %dl
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 32(%esp,%ebx), %esi
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %esi, %eax
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, (%esp) # 4-byte Spill
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 32(%esp,%ebp,4), %ebx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 28(%esp,%ebp,4), %eax
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, %edx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %ebx, %edx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %eax, %esi
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    addl %esi, %esi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    shlxl %edx, %esi, %esi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    orl %eax, %esi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %esi, (%esp) # 4-byte Spill
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 40(%esp,%ebx), %ebp
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    leal (%ebp,%ebp), %eax
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    shlxl %edx, %eax, %edi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 36(%esp,%ebx), %eax
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    shrxl %ecx, %eax, %eax
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    orl %eax, %edi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 48(%esp,%ebx), %esi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    leal (%esi,%esi), %edi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    shlxl %edx, %edi, %eax
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 44(%esp,%ebx), %edx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    shrxl %ecx, %edx, %edi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    orl %edi, %eax
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %eax, %edi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %edx, %ebp
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 24(%esp,%ebx), %edx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 52(%esp,%ebx), %eax
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 40(%esp,%ebp,4), %eax
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 36(%esp,%ebp,4), %edx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edx, %esi
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %eax, %esi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ebx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %esi, 24(%ebx)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    sarxl %ecx, %eax, %eax
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, 28(%ebx)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ebp, 16(%ebx)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edi, 8(%ebx)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    # kill: def $cl killed $cl killed $ecx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %edx, %ebx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 16(%esp,%ebp,4), %edx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 44(%esp,%ebp,4), %edi
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %edi, %eax
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ebp
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, 24(%ebp)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    sarxl %ecx, %edi, %eax
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, 28(%ebp)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ebx, 16(%ebp)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %esi, 20(%ebp)
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %eax, %edx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edx, (%ebx)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, 8(%ebp)
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, 20(%ebx)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, 12(%ebp)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    # kill: def $cl killed $cl killed $ecx
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, 12(%ebx)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %eax, %edx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edx, (%ebp)
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl (%esp), %eax # 4-byte Reload
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, 4(%ebx)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    addl $88, %esp
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, 4(%ebp)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    addl $92, %esp
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    popl %esi
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    popl %edi
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    popl %ebx
@@ -3381,6 +3098,7 @@ define void @lshr_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; X64-NO-BMI2-NO-SHLD-NEXT:    pushq %r13
 ; X64-NO-BMI2-NO-SHLD-NEXT:    pushq %r12
 ; X64-NO-BMI2-NO-SHLD-NEXT:    pushq %rbx
+; X64-NO-BMI2-NO-SHLD-NEXT:    pushq %rax
 ; X64-NO-BMI2-NO-SHLD-NEXT:    movq (%rdi), %rax
 ; X64-NO-BMI2-NO-SHLD-NEXT:    movq 8(%rdi), %rcx
 ; X64-NO-BMI2-NO-SHLD-NEXT:    movq 16(%rdi), %r9
@@ -3390,6 +3108,11 @@ define void @lshr_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; X64-NO-BMI2-NO-SHLD-NEXT:    movq 48(%rdi), %r14
 ; X64-NO-BMI2-NO-SHLD-NEXT:    movq 56(%rdi), %rdi
 ; X64-NO-BMI2-NO-SHLD-NEXT:    movl (%rsi), %r8d
+; X64-NO-BMI2-NO-SHLD-NEXT:    xorps %xmm0, %xmm0
+; X64-NO-BMI2-NO-SHLD-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-NO-SHLD-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-NO-SHLD-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-NO-SHLD-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
 ; X64-NO-BMI2-NO-SHLD-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
 ; X64-NO-BMI2-NO-SHLD-NEXT:    movq %r14, -{{[0-9]+}}(%rsp)
 ; X64-NO-BMI2-NO-SHLD-NEXT:    movq %rbx, -{{[0-9]+}}(%rsp)
@@ -3398,18 +3121,10 @@ define void @lshr_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; X64-NO-BMI2-NO-SHLD-NEXT:    movq %r9, -{{[0-9]+}}(%rsp)
 ; X64-NO-BMI2-NO-SHLD-NEXT:    movq %rcx, -{{[0-9]+}}(%rsp)
 ; X64-NO-BMI2-NO-SHLD-NEXT:    movq %rax, -{{[0-9]+}}(%rsp)
-; X64-NO-BMI2-NO-SHLD-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
-; X64-NO-BMI2-NO-SHLD-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
-; X64-NO-BMI2-NO-SHLD-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
-; X64-NO-BMI2-NO-SHLD-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
-; X64-NO-BMI2-NO-SHLD-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
-; X64-NO-BMI2-NO-SHLD-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
-; X64-NO-BMI2-NO-SHLD-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
-; X64-NO-BMI2-NO-SHLD-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
 ; X64-NO-BMI2-NO-SHLD-NEXT:    movl %r8d, %eax
-; X64-NO-BMI2-NO-SHLD-NEXT:    andl $7, %eax
+; X64-NO-BMI2-NO-SHLD-NEXT:    andl $63, %eax
 ; X64-NO-BMI2-NO-SHLD-NEXT:    shrl $3, %r8d
-; X64-NO-BMI2-NO-SHLD-NEXT:    andl $63, %r8d
+; X64-NO-BMI2-NO-SHLD-NEXT:    andl $56, %r8d
 ; X64-NO-BMI2-NO-SHLD-NEXT:    movq -128(%rsp,%r8), %r11
 ; X64-NO-BMI2-NO-SHLD-NEXT:    movq -120(%rsp,%r8), %r9
 ; X64-NO-BMI2-NO-SHLD-NEXT:    movq %r9, %rsi
@@ -3417,7 +3132,6 @@ define void @lshr_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; X64-NO-BMI2-NO-SHLD-NEXT:    shrq %cl, %rsi
 ; X64-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %edi
 ; X64-NO-BMI2-NO-SHLD-NEXT:    notl %edi
-; X64-NO-BMI2-NO-SHLD-NEXT:    andl $63, %edi
 ; X64-NO-BMI2-NO-SHLD-NEXT:    movq -112(%rsp,%r8), %r14
 ; X64-NO-BMI2-NO-SHLD-NEXT:    leaq (%r14,%r14), %r10
 ; X64-NO-BMI2-NO-SHLD-NEXT:    movl %edi, %ecx
@@ -3426,7 +3140,7 @@ define void @lshr_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; X64-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %ecx
 ; X64-NO-BMI2-NO-SHLD-NEXT:    shrq %cl, %r11
 ; X64-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %esi
-; X64-NO-BMI2-NO-SHLD-NEXT:    notb %sil
+; X64-NO-BMI2-NO-SHLD-NEXT:    xorb $63, %sil
 ; X64-NO-BMI2-NO-SHLD-NEXT:    addq %r9, %r9
 ; X64-NO-BMI2-NO-SHLD-NEXT:    movl %esi, %ecx
 ; X64-NO-BMI2-NO-SHLD-NEXT:    shlq %cl, %r9
@@ -3478,6 +3192,7 @@ define void @lshr_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; X64-NO-BMI2-NO-SHLD-NEXT:    movq %rbx, 24(%rdx)
 ; X64-NO-BMI2-NO-SHLD-NEXT:    movq %r9, (%rdx)
 ; X64-NO-BMI2-NO-SHLD-NEXT:    movq %r10, 8(%rdx)
+; X64-NO-BMI2-NO-SHLD-NEXT:    addq $8, %rsp
 ; X64-NO-BMI2-NO-SHLD-NEXT:    popq %rbx
 ; X64-NO-BMI2-NO-SHLD-NEXT:    popq %r12
 ; X64-NO-BMI2-NO-SHLD-NEXT:    popq %r13
@@ -3488,22 +3203,24 @@ define void @lshr_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ;
 ; X64-NO-BMI2-HAVE-SHLD-LABEL: lshr_64bytes:
 ; X64-NO-BMI2-HAVE-SHLD:       # %bb.0:
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    pushq %rbp
 ; X64-NO-BMI2-HAVE-SHLD-NEXT:    pushq %r15
 ; X64-NO-BMI2-HAVE-SHLD-NEXT:    pushq %r14
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    pushq %r13
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    pushq %r12
 ; X64-NO-BMI2-HAVE-SHLD-NEXT:    pushq %rbx
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    pushq %rax
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq (%rdi), %rax
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq 8(%rdi), %rcx
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq 16(%rdi), %r8
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq 24(%rdi), %r9
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq 32(%rdi), %r10
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq 40(%rdi), %r11
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq 48(%rdi), %rbx
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq 56(%rdi), %r14
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    movl (%rsi), %edi
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq (%rdi), %rcx
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq 8(%rdi), %r8
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq 16(%rdi), %r9
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq 24(%rdi), %r10
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq 32(%rdi), %r11
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq 40(%rdi), %rbx
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq 48(%rdi), %r14
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq 56(%rdi), %rdi
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movl (%rsi), %eax
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    xorps %xmm0, %xmm0
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
 ; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %r14, -{{[0-9]+}}(%rsp)
 ; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %rbx, -{{[0-9]+}}(%rsp)
 ; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %r11, -{{[0-9]+}}(%rsp)
@@ -3511,73 +3228,41 @@ define void @lshr_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %r9, -{{[0-9]+}}(%rsp)
 ; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %r8, -{{[0-9]+}}(%rsp)
 ; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %rcx, -{{[0-9]+}}(%rsp)
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %rax, -{{[0-9]+}}(%rsp)
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq $0, (%rsp)
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    movl %edi, %eax
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    andl $7, %eax
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    shrl $3, %edi
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    andl $63, %edi
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq -120(%rsp,%rdi), %r8
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq -112(%rsp,%rdi), %rbx
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %rbx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
 ; X64-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, %ecx
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    shrq %cl, %rbx
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, %esi
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    notl %esi
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    andl $63, %esi
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq -104(%rsp,%rdi), %r11
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    leaq (%r11,%r11), %r10
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    movl %esi, %ecx
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    shlq %cl, %r10
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    orq %rbx, %r10
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq -96(%rsp,%rdi), %r15
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %r15, %r12
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, %ecx
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    shrq %cl, %r12
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq -88(%rsp,%rdi), %r14
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    leaq (%r14,%r14), %rbx
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    movl %esi, %ecx
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    shlq %cl, %rbx
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    orq %r12, %rbx
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq -80(%rsp,%rdi), %r12
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %r12, %r13
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, %ecx
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    shrq %cl, %r13
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq -72(%rsp,%rdi), %rbp
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    leaq (%rbp,%rbp), %r9
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    movl %esi, %ecx
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    shlq %cl, %r9
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    orq %r13, %r9
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, %ecx
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    shrdq %cl, %r15, %r11
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    shrdq %cl, %r12, %r14
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq -64(%rsp,%rdi), %rsi
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    shrdq %cl, %rsi, %rbp
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rdi # 8-byte Reload
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    andl $63, %ecx
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    shrl $3, %eax
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    andl $56, %eax
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq -112(%rsp,%rax), %rdi
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq -128(%rsp,%rax), %rsi
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq -120(%rsp,%rax), %r9
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %r9, %r8
 ; X64-NO-BMI2-HAVE-SHLD-NEXT:    shrdq %cl, %rdi, %r8
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    shrq %cl, %rsi
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %rbp, 48(%rdx)
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %rsi, 56(%rdx)
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %r14, 32(%rdx)
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %r11, 16(%rdx)
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %r8, (%rdx)
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %r9, 40(%rdx)
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq -96(%rsp,%rax), %r10
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq -104(%rsp,%rax), %r11
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %r11, %rbx
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    shrdq %cl, %r10, %rbx
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    shrdq %cl, %r11, %rdi
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq -80(%rsp,%rax), %r11
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq -88(%rsp,%rax), %r14
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %r14, %r15
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    shrdq %cl, %r11, %r15
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    shrdq %cl, %r14, %r10
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq -72(%rsp,%rax), %rax
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    shrdq %cl, %rax, %r11
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    shrdq %cl, %r9, %rsi
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    # kill: def $cl killed $cl killed $ecx
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    shrq %cl, %rax
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %r11, 48(%rdx)
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %rax, 56(%rdx)
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %r10, 32(%rdx)
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %r15, 40(%rdx)
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %rdi, 16(%rdx)
 ; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %rbx, 24(%rdx)
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %r10, 8(%rdx)
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    addq $8, %rsp
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %rsi, (%rdx)
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %r8, 8(%rdx)
 ; X64-NO-BMI2-HAVE-SHLD-NEXT:    popq %rbx
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    popq %r12
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    popq %r13
 ; X64-NO-BMI2-HAVE-SHLD-NEXT:    popq %r14
 ; X64-NO-BMI2-HAVE-SHLD-NEXT:    popq %r15
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    popq %rbp
 ; X64-NO-BMI2-HAVE-SHLD-NEXT:    retq
 ;
 ; X64-HAVE-BMI2-NO-SHLD-LABEL: lshr_64bytes:
@@ -3588,6 +3273,7 @@ define void @lshr_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    pushq %r13
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    pushq %r12
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    pushq %rbx
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    pushq %rax
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq (%rdi), %rcx
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq 8(%rdi), %r8
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq 16(%rdi), %r9
@@ -3597,6 +3283,11 @@ define void @lshr_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq 48(%rdi), %r14
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq 56(%rdi), %rdi
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    movl (%rsi), %eax
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    xorps %xmm0, %xmm0
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq %r14, -{{[0-9]+}}(%rsp)
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq %rbx, -{{[0-9]+}}(%rsp)
@@ -3606,52 +3297,43 @@ define void @lshr_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq %r8, -{{[0-9]+}}(%rsp)
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq %rcx, -{{[0-9]+}}(%rsp)
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, %ecx
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    andl $7, %ecx
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    andl $63, %ecx
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    shrl $3, %eax
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    andl $63, %eax
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    andl $56, %eax
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq -120(%rsp,%rax), %rdi
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq -112(%rsp,%rax), %r9
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    shrxq %rcx, %rdi, %rbx
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    shrxq %rcx, -128(%rsp,%rax), %r14
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq -112(%rsp,%rax), %r8
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    shrxq %rcx, %rdi, %r15
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    shrxq %rcx, -128(%rsp,%rax), %rbx
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, %esi
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq -104(%rsp,%rax), %r8
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    shrxq %rcx, %r8, %r13
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    shrxq %rcx, %r9, %r10
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq -104(%rsp,%rax), %r9
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    shrxq %rcx, %r9, %r13
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    shrxq %rcx, %r8, %r10
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq -88(%rsp,%rax), %r11
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    shrxq %rcx, %r11, %r15
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    shrxq %rcx, %r11, %r14
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, %r12d
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    notl %r12d
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    andl $63, %r12d
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    addq %r9, %r9
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    shlxq %r12, %r9, %r9
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    orq %rbx, %r9
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq -96(%rsp,%rax), %rbx
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    shrxq %rcx, %rbx, %rbp
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    notb %sil
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    addq %r8, %r8
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    shlxq %r12, %r8, %r8
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    orq %r15, %r8
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq -96(%rsp,%rax), %r15
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    shrxq %rcx, %r15, %rbp
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    xorb $63, %sil
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    addq %rdi, %rdi
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    shlxq %rsi, %rdi, %rdi
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    orq %r14, %rdi
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    addq %rbx, %rbx
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    orq %rbx, %rdi
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    leaq (%r15,%r15), %rbx
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    shlxq %r12, %rbx, %rbx
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    orq %r13, %rbx
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq -80(%rsp,%rax), %r14
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    shrxq %rcx, %r14, %r13
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq -80(%rsp,%rax), %r15
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    shrxq %rcx, %r15, %r13
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq -72(%rsp,%rax), %rax
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    shrxq %rcx, %rax, %rcx
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    addq %r8, %r8
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    shlxq %rsi, %r8, %r8
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    orq %r10, %r8
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    leaq (%r14,%r14), %r10
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    addq %r9, %r9
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    shlxq %rsi, %r9, %r9
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    orq %r10, %r9
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    leaq (%r15,%r15), %r10
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    shlxq %r12, %r10, %r10
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    orq %r15, %r10
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    orq %r14, %r10
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    addq %r11, %r11
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    shlxq %rsi, %r11, %r11
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    orq %rbp, %r11
@@ -3662,10 +3344,11 @@ define void @lshr_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq %rax, 48(%rdx)
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq %r11, 32(%rdx)
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq %r10, 40(%rdx)
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq %r8, 16(%rdx)
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq %r9, 16(%rdx)
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq %rbx, 24(%rdx)
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq %rdi, (%rdx)
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq %r9, 8(%rdx)
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq %r8, 8(%rdx)
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    addq $8, %rsp
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    popq %rbx
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    popq %r12
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    popq %r13
@@ -3676,11 +3359,8 @@ define void @lshr_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ;
 ; X64-HAVE-BMI2-HAVE-SHLD-LABEL: lshr_64bytes:
 ; X64-HAVE-BMI2-HAVE-SHLD:       # %bb.0:
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    pushq %rbp
 ; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    pushq %r15
 ; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    pushq %r14
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    pushq %r13
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    pushq %r12
 ; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    pushq %rbx
 ; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq (%rdi), %rcx
 ; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq 8(%rdi), %r8
@@ -3691,6 +3371,11 @@ define void @lshr_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq 48(%rdi), %r14
 ; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq 56(%rdi), %rdi
 ; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movl (%rsi), %eax
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    xorps %xmm0, %xmm0
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
 ; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
 ; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq %r14, -{{[0-9]+}}(%rsp)
 ; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq %rbx, -{{[0-9]+}}(%rsp)
@@ -3700,60 +3385,39 @@ define void @lshr_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq %r8, -{{[0-9]+}}(%rsp)
 ; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq %rcx, -{{[0-9]+}}(%rsp)
 ; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, %ecx
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    andl $7, %ecx
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    andl $63, %ecx
 ; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    shrl $3, %eax
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    andl $63, %eax
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq -120(%rsp,%rax), %rsi
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    shrxq %rcx, %rsi, %r10
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq -104(%rsp,%rax), %r8
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    shrxq %rcx, %r8, %r15
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq -88(%rsp,%rax), %r11
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    shrxq %rcx, %r11, %r14
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ecx, %r12d
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    notl %r12d
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    andl $63, %r12d
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq -112(%rsp,%rax), %r9
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    leaq (%r9,%r9), %rdi
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    shlxq %r12, %rdi, %rdi
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    orq %r10, %rdi
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq -96(%rsp,%rax), %rbx
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    leaq (%rbx,%rbx), %r10
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    shlxq %r12, %r10, %r10
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    orq %r15, %r10
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq -80(%rsp,%rax), %r13
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    leaq (%r13,%r13), %r15
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    shlxq %r12, %r15, %r15
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq -72(%rsp,%rax), %r12
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    shrxq %rcx, %r12, %rbp
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq -128(%rsp,%rax), %rax
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    orq %r14, %r15
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    shrdq %cl, %r8, %r9
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    shrdq %cl, %r11, %rbx
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    shrdq %cl, %r12, %r13
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    # kill: def $cl killed $cl killed $rcx
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    shrdq %cl, %rsi, %rax
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq %r13, 48(%rdx)
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq %rbp, 56(%rdx)
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq %rbx, 32(%rdx)
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq %r9, 16(%rdx)
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq %rax, (%rdx)
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    andl $56, %eax
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq -112(%rsp,%rax), %rdi
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq -128(%rsp,%rax), %rsi
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq -120(%rsp,%rax), %r9
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq %r9, %r8
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    shrdq %cl, %rdi, %r8
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq -96(%rsp,%rax), %r10
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq -104(%rsp,%rax), %r11
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq %r11, %rbx
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    shrdq %cl, %r10, %rbx
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    shrdq %cl, %r11, %rdi
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq -80(%rsp,%rax), %r11
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq -88(%rsp,%rax), %r14
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq %r14, %r15
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    shrdq %cl, %r11, %r15
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    shrdq %cl, %r14, %r10
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq -72(%rsp,%rax), %rax
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    shrdq %cl, %rax, %r11
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    shrdq %cl, %r9, %rsi
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    shrxq %rcx, %rax, %rax
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq %r11, 48(%rdx)
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq %rax, 56(%rdx)
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq %r10, 32(%rdx)
 ; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq %r15, 40(%rdx)
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq %r10, 24(%rdx)
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq %rdi, 8(%rdx)
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq %rdi, 16(%rdx)
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq %rbx, 24(%rdx)
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq %rsi, (%rdx)
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq %r8, 8(%rdx)
 ; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    popq %rbx
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    popq %r12
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    popq %r13
 ; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    popq %r14
 ; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    popq %r15
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    popq %rbp
 ; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    retq
 ;
 ; X86-NO-BMI2-NO-SHLD-LABEL: lshr_64bytes:
@@ -3762,40 +3426,44 @@ define void @lshr_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; X86-NO-BMI2-NO-SHLD-NEXT:    pushl %ebx
 ; X86-NO-BMI2-NO-SHLD-NEXT:    pushl %edi
 ; X86-NO-BMI2-NO-SHLD-NEXT:    pushl %esi
-; X86-NO-BMI2-NO-SHLD-NEXT:    subl $208, %esp
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl (%esi), %eax
+; X86-NO-BMI2-NO-SHLD-NEXT:    subl $204, %esp
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %edi
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl (%edi), %eax
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl 4(%esi), %eax
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl 4(%edi), %eax
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, (%esp) # 4-byte Spill
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl 8(%edi), %eax
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl 8(%esi), %eax
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl 12(%edi), %eax
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl 12(%esi), %eax
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl 16(%edi), %eax
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl 16(%esi), %eax
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl 20(%edi), %eax
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl 20(%esi), %eax
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, (%esp) # 4-byte Spill
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl 24(%esi), %eax
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl 24(%edi), %eax
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl 28(%esi), %eax
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl 28(%edi), %eax
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl 32(%esi), %eax
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl 32(%edi), %eax
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl 36(%esi), %eax
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl 36(%edi), %eax
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl 40(%esi), %ebp
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl 44(%esi), %ebx
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl 48(%esi), %edi
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl 52(%esi), %edx
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl 56(%esi), %ecx
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl 60(%esi), %eax
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl (%esi), %esi
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl 40(%edi), %ebp
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl 44(%edi), %ebx
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl 48(%edi), %esi
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl 52(%edi), %edx
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl 56(%edi), %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl 60(%edi), %eax
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %edi
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl (%edi), %edi
+; X86-NO-BMI2-NO-SHLD-NEXT:    xorps %xmm0, %xmm0
+; X86-NO-BMI2-NO-SHLD-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-NO-SHLD-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-NO-SHLD-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, {{[0-9]+}}(%esp)
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl %edx, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %edi, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %esi, {{[0-9]+}}(%esp)
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ebx, {{[0-9]+}}(%esp)
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ebp, {{[0-9]+}}(%esp)
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
@@ -3806,8 +3474,7 @@ define void @lshr_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, {{[0-9]+}}(%esp)
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl (%esp), %eax # 4-byte Reload
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-NO-SHLD-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, {{[0-9]+}}(%esp)
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
@@ -3816,214 +3483,199 @@ define void @lshr_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, {{[0-9]+}}(%esp)
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl (%esp), %eax # 4-byte Reload
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, {{[0-9]+}}(%esp)
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %esi, %ecx
-; X86-NO-BMI2-NO-SHLD-NEXT:    andl $7, %ecx
-; X86-NO-BMI2-NO-SHLD-NEXT:    shrl $3, %esi
-; X86-NO-BMI2-NO-SHLD-NEXT:    andl $63, %esi
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl 84(%esp,%esi), %edi
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %edi
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl 88(%esp,%esi), %eax
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-NO-SHLD-NEXT:    leal (%eax,%eax), %ebx
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ecx, %edx
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ecx, %eax
-; X86-NO-BMI2-NO-SHLD-NEXT:    notl %edx
-; X86-NO-BMI2-NO-SHLD-NEXT:    andl $31, %edx
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %edx, %ecx
-; X86-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %ebx
-; X86-NO-BMI2-NO-SHLD-NEXT:    orl %edi, %ebx
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl 92(%esp,%esi), %edi
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %ecx
-; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %edi
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl 96(%esp,%esi), %ecx
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-NO-SHLD-NEXT:    leal (%ecx,%ecx), %ebx
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %edx, %ecx
-; X86-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %ebx
-; X86-NO-BMI2-NO-SHLD-NEXT:    orl %edi, %ebx
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl 100(%esp,%esi), %edi
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %ecx
-; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %edi
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl 104(%esp,%esi), %ecx
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-NO-SHLD-NEXT:    leal (%ecx,%ecx), %ebx
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %edx, %ecx
-; X86-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %ebx
-; X86-NO-BMI2-NO-SHLD-NEXT:    orl %edi, %ebx
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl 108(%esp,%esi), %edi
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %ecx
-; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %edi
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl 112(%esp,%esi), %ecx
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-NO-SHLD-NEXT:    leal (%ecx,%ecx), %ebx
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %edx, %ecx
-; X86-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %ebx
-; X86-NO-BMI2-NO-SHLD-NEXT:    orl %edi, %ebx
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl 116(%esp,%esi), %ebx
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %ecx
-; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %ebx
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl 120(%esp,%esi), %ecx
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-NO-SHLD-NEXT:    leal (%ecx,%ecx), %edi
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %edx, %ecx
-; X86-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %edi
-; X86-NO-BMI2-NO-SHLD-NEXT:    orl %ebx, %edi
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl 124(%esp,%esi), %ebx
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ebx, (%esp) # 4-byte Spill
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %ecx
-; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %ebx
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl 128(%esp,%esi), %ecx
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-NO-SHLD-NEXT:    leal (%ecx,%ecx), %edi
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %edx, %ecx
-; X86-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %edi
-; X86-NO-BMI2-NO-SHLD-NEXT:    orl %ebx, %edi
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl 132(%esp,%esi), %edi
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl %edi, %ebx
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %ecx
-; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %ebx
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl 136(%esp,%esi), %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT:    andl $31, %ebx
+; X86-NO-BMI2-NO-SHLD-NEXT:    shrl $3, %edi
+; X86-NO-BMI2-NO-SHLD-NEXT:    andl $60, %edi
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl 68(%esp,%edi), %eax
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %esi
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ebx, %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %esi
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ebx, %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT:    notl %ecx
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-NO-SHLD-NEXT:    leal (%ecx,%ecx), %ebp
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %edx, %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl 72(%esp,%edi), %edx
+; X86-NO-BMI2-NO-SHLD-NEXT:    leal (%edx,%edx), %ebp
+; X86-NO-BMI2-NO-SHLD-NEXT:    # kill: def $cl killed $cl killed $ecx
 ; X86-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %ebp
-; X86-NO-BMI2-NO-SHLD-NEXT:    orl %ebx, %ebp
+; X86-NO-BMI2-NO-SHLD-NEXT:    orl %esi, %ebp
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl 64(%esp,%edi), %esi
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ebx, %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %esi
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ebx, %ebp
+; X86-NO-BMI2-NO-SHLD-NEXT:    xorb $31, %cl
+; X86-NO-BMI2-NO-SHLD-NEXT:    movb %cl, (%esp) # 1-byte Spill
+; X86-NO-BMI2-NO-SHLD-NEXT:    addl %eax, %eax
+; X86-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %eax
+; X86-NO-BMI2-NO-SHLD-NEXT:    orl %esi, %eax
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl 76(%esp,%edi), %ebx
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ebp, %eax
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl 80(%esp,%esi), %ebx
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %ecx
 ; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %ebx
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %edx
-; X86-NO-BMI2-NO-SHLD-NEXT:    notb %dl
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
-; X86-NO-BMI2-NO-SHLD-NEXT:    addl %ebp, %ebp
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %edx, %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl 80(%esp,%edi), %esi
+; X86-NO-BMI2-NO-SHLD-NEXT:    leal (%esi,%esi), %ebp
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NO-BMI2-NO-SHLD-NEXT:    # kill: def $cl killed $cl killed $ecx
 ; X86-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %ebp
 ; X86-NO-BMI2-NO-SHLD-NEXT:    orl %ebx, %ebp
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %ecx
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %edx
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NO-BMI2-NO-SHLD-NEXT:    addl %eax, %eax
+; X86-NO-BMI2-NO-SHLD-NEXT:    movzbl (%esp), %ecx # 1-byte Folded Reload
+; X86-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %eax
+; X86-NO-BMI2-NO-SHLD-NEXT:    orl %edx, %eax
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl 84(%esp,%edi), %ebp
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ebp, %ebx
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %edx, %ecx
 ; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %ebx
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
-; X86-NO-BMI2-NO-SHLD-NEXT:    addl %ebp, %ebp
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl 88(%esp,%edi), %eax
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-BMI2-NO-SHLD-NEXT:    addl %eax, %eax
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NO-BMI2-NO-SHLD-NEXT:    # kill: def $cl killed $cl killed $ecx
+; X86-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %eax
+; X86-NO-BMI2-NO-SHLD-NEXT:    orl %ebx, %eax
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl %edx, %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %edx, %ebx
+; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %esi
+; X86-NO-BMI2-NO-SHLD-NEXT:    addl %ebp, %ebp
+; X86-NO-BMI2-NO-SHLD-NEXT:    movzbl (%esp), %ecx # 1-byte Folded Reload
 ; X86-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %ebp
-; X86-NO-BMI2-NO-SHLD-NEXT:    orl %ebx, %ebp
+; X86-NO-BMI2-NO-SHLD-NEXT:    orl %esi, %ebp
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %ecx
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %ebx
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
-; X86-NO-BMI2-NO-SHLD-NEXT:    addl %ebp, %ebp
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %edx, %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl 92(%esp,%edi), %edx
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %edx, %esi
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ebx, %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %esi
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl 96(%esp,%edi), %eax
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-BMI2-NO-SHLD-NEXT:    leal (%eax,%eax), %ebp
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NO-BMI2-NO-SHLD-NEXT:    # kill: def $cl killed $cl killed $ecx
 ; X86-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %ebp
-; X86-NO-BMI2-NO-SHLD-NEXT:    orl %ebx, %ebp
+; X86-NO-BMI2-NO-SHLD-NEXT:    orl %esi, %ebp
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ebx, %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %eax
+; X86-NO-BMI2-NO-SHLD-NEXT:    addl %edx, %edx
+; X86-NO-BMI2-NO-SHLD-NEXT:    movzbl (%esp), %ecx # 1-byte Folded Reload
+; X86-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %edx
+; X86-NO-BMI2-NO-SHLD-NEXT:    orl %eax, %edx
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl 100(%esp,%edi), %ebp
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ebp, %esi
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ebx, %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %esi
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl 104(%esp,%edi), %edx
+; X86-NO-BMI2-NO-SHLD-NEXT:    leal (%edx,%edx), %eax
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NO-BMI2-NO-SHLD-NEXT:    # kill: def $cl killed $cl killed $ecx
+; X86-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %eax
+; X86-NO-BMI2-NO-SHLD-NEXT:    orl %esi, %eax
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ebx, %eax
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %ecx
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %ebx
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %esi
 ; X86-NO-BMI2-NO-SHLD-NEXT:    addl %ebp, %ebp
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %edx, %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT:    movzbl (%esp), %ecx # 1-byte Folded Reload
 ; X86-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %ebp
-; X86-NO-BMI2-NO-SHLD-NEXT:    orl %ebx, %ebp
+; X86-NO-BMI2-NO-SHLD-NEXT:    orl %esi, %ebp
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl 108(%esp,%edi), %ebx
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ebx, %esi
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %ecx
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %ebx
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
-; X86-NO-BMI2-NO-SHLD-NEXT:    addl %ebp, %ebp
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %edx, %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %esi
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl 112(%esp,%edi), %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-BMI2-NO-SHLD-NEXT:    leal (%ecx,%ecx), %ebp
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NO-BMI2-NO-SHLD-NEXT:    # kill: def $cl killed $cl killed $ecx
 ; X86-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %ebp
-; X86-NO-BMI2-NO-SHLD-NEXT:    orl %ebx, %ebp
+; X86-NO-BMI2-NO-SHLD-NEXT:    orl %esi, %ebp
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %ecx
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %ebx
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl (%esp), %ebp # 4-byte Reload
-; X86-NO-BMI2-NO-SHLD-NEXT:    addl %ebp, %ebp
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %ebp
+; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %edx
+; X86-NO-BMI2-NO-SHLD-NEXT:    addl %ebx, %ebx
+; X86-NO-BMI2-NO-SHLD-NEXT:    movzbl (%esp), %ecx # 1-byte Folded Reload
+; X86-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %ebx
+; X86-NO-BMI2-NO-SHLD-NEXT:    orl %edx, %ebx
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl 116(%esp,%edi), %esi
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %esi, %eax
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ebp, %edx
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl %edx, %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %eax
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl 120(%esp,%edi), %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-BMI2-NO-SHLD-NEXT:    leal (%ecx,%ecx), %ebp
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NO-BMI2-NO-SHLD-NEXT:    # kill: def $cl killed $cl killed $ecx
 ; X86-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %ebp
-; X86-NO-BMI2-NO-SHLD-NEXT:    orl %ebx, %ebp
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ebp, (%esp) # 4-byte Spill
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %ecx
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %ebx
-; X86-NO-BMI2-NO-SHLD-NEXT:    addl %edi, %edi
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %edx, %ecx
-; X86-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %edi
-; X86-NO-BMI2-NO-SHLD-NEXT:    orl %ebx, %edi
+; X86-NO-BMI2-NO-SHLD-NEXT:    orl %eax, %ebp
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %edx, %eax
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %ecx
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %ebp
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %edx
+; X86-NO-BMI2-NO-SHLD-NEXT:    addl %esi, %esi
+; X86-NO-BMI2-NO-SHLD-NEXT:    movb (%esp), %ch # 1-byte Reload
+; X86-NO-BMI2-NO-SHLD-NEXT:    movb %ch, %cl
+; X86-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %esi
+; X86-NO-BMI2-NO-SHLD-NEXT:    orl %edx, %esi
+; X86-NO-BMI2-NO-SHLD-NEXT:    movb %al, %cl
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
 ; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %eax
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl 140(%esp,%esi), %esi
-; X86-NO-BMI2-NO-SHLD-NEXT:    leal (%esi,%esi), %ebx
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %edx, %ecx
-; X86-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %ebx
-; X86-NO-BMI2-NO-SHLD-NEXT:    orl %eax, %ebx
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ebp, %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl 124(%esp,%edi), %edi
+; X86-NO-BMI2-NO-SHLD-NEXT:    leal (%edi,%edi), %edx
+; X86-NO-BMI2-NO-SHLD-NEXT:    movb %ch, %cl
+; X86-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %edx
+; X86-NO-BMI2-NO-SHLD-NEXT:    orl %eax, %edx
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
 ; X86-NO-BMI2-NO-SHLD-NEXT:    # kill: def $cl killed $cl killed $ecx
-; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %esi
+; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %edi
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %esi, 60(%eax)
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ebx, 56(%eax)
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %edi, 48(%eax)
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl (%esp), %ecx # 4-byte Reload
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ecx, 40(%eax)
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ecx, 32(%eax)
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ecx, 24(%eax)
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ecx, 16(%eax)
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ecx, 8(%eax)
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ecx, (%eax)
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ecx, 52(%eax)
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %edi, 60(%eax)
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %edx, 56(%eax)
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %esi, 48(%eax)
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ebp, 52(%eax)
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ebx, 40(%eax)
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ecx, 44(%eax)
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ecx, 32(%eax)
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ecx, 36(%eax)
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ecx, 24(%eax)
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ecx, 28(%eax)
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ecx, 16(%eax)
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ecx, 20(%eax)
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ecx, 8(%eax)
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ecx, 12(%eax)
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ecx, (%eax)
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ecx, 4(%eax)
-; X86-NO-BMI2-NO-SHLD-NEXT:    addl $208, %esp
+; X86-NO-BMI2-NO-SHLD-NEXT:    addl $204, %esp
 ; X86-NO-BMI2-NO-SHLD-NEXT:    popl %esi
 ; X86-NO-BMI2-NO-SHLD-NEXT:    popl %edi
 ; X86-NO-BMI2-NO-SHLD-NEXT:    popl %ebx
@@ -4036,209 +3688,153 @@ define void @lshr_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; X86-NO-BMI2-HAVE-SHLD-NEXT:    pushl %ebx
 ; X86-NO-BMI2-HAVE-SHLD-NEXT:    pushl %edi
 ; X86-NO-BMI2-HAVE-SHLD-NEXT:    pushl %esi
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    subl $204, %esp
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl (%esi), %eax
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 4(%esi), %eax
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 8(%esi), %eax
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 12(%esi), %eax
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 16(%esi), %eax
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 20(%esi), %eax
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, (%esp) # 4-byte Spill
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 24(%esi), %eax
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 28(%esi), %eax
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 32(%esi), %eax
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 36(%esi), %eax
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 40(%esi), %ebp
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 44(%esi), %ebx
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 48(%esi), %edi
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 52(%esi), %edx
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 56(%esi), %ecx
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 60(%esi), %eax
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl (%esi), %esi
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    subl $188, %esp
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl (%eax), %ecx
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 4(%eax), %ecx
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 8(%eax), %ecx
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 12(%eax), %ecx
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 16(%eax), %ecx
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 20(%eax), %ecx
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 24(%eax), %ecx
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 28(%eax), %ecx
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 32(%eax), %ecx
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 36(%eax), %ecx
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %ecx, (%esp) # 4-byte Spill
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 40(%eax), %ebp
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 44(%eax), %ebx
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 48(%eax), %edi
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 52(%eax), %esi
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 56(%eax), %edx
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 60(%eax), %ecx
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl (%eax), %eax
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    xorps %xmm0, %xmm0
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
 ; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
 ; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %esi, {{[0-9]+}}(%esp)
 ; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %edi, {{[0-9]+}}(%esp)
 ; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %ebx, {{[0-9]+}}(%esp)
 ; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %ebp, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl (%esp), %eax # 4-byte Reload
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %esi, %eax
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    andl $7, %eax
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    shrl $3, %esi
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    andl $63, %esi
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 80(%esp,%esi), %edi
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, %ecx
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    shrl %cl, %edi
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 84(%esp,%esi), %ecx
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    leal (%ecx,%ecx), %ebx
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, %edx
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    notl %edx
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    andl $31, %edx
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %edx, %ecx
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    shll %cl, %ebx
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    orl %edi, %ebx
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 88(%esp,%esi), %edi
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, %ecx
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    shrl %cl, %edi
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 92(%esp,%esi), %ecx
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    leal (%ecx,%ecx), %ebx
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %edx, %ecx
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    shll %cl, %ebx
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    orl %edi, %ebx
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 96(%esp,%esi), %edi
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, %ecx
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    shrl %cl, %edi
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 100(%esp,%esi), %ecx
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    leal (%ecx,%ecx), %ebx
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %edx, %ecx
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    shll %cl, %ebx
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    orl %edi, %ebx
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 104(%esp,%esi), %edi
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl (%esp), %ecx # 4-byte Reload
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
 ; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, %ecx
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    shrl %cl, %edi
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 108(%esp,%esi), %ecx
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    leal (%ecx,%ecx), %ebx
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %edx, %ecx
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    shll %cl, %ebx
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    orl %edi, %ebx
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 112(%esp,%esi), %edi
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    andl $31, %ecx
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    shrl $3, %eax
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    andl $60, %eax
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 56(%esp,%eax), %esi
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 52(%esp,%eax), %edx
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %esi, %edx
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 64(%esp,%eax), %ebx
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 60(%esp,%eax), %edx
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %edx, %edi
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %ebx, %edi
 ; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, %ecx
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    shrl %cl, %edi
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 116(%esp,%esi), %ecx
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    leal (%ecx,%ecx), %ebx
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %edx, %ecx
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    shll %cl, %ebx
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    orl %edi, %ebx
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %edx, %esi
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 72(%esp,%eax), %edi
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 68(%esp,%eax), %edx
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %edx, %esi
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %edi, %esi
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %edx, %ebx
 ; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 120(%esp,%esi), %edi
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 80(%esp,%eax), %ebx
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 76(%esp,%eax), %edx
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %edx, %esi
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %ebx, %esi
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %edx, %edi
 ; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, %ecx
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    shrl %cl, %edi
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 124(%esp,%esi), %ecx
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %ecx, (%esp) # 4-byte Spill
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    leal (%ecx,%ecx), %ebx
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %edx, %ecx
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    shll %cl, %ebx
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    orl %edi, %ebx
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 88(%esp,%eax), %edi
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 84(%esp,%eax), %edx
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %edx, %esi
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %edi, %esi
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %edi, %esi
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %edx, %ebx
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %ebx, (%esp) # 4-byte Spill
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 96(%esp,%eax), %edi
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 92(%esp,%eax), %edx
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %edx, %ebx
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %edi, %ebx
 ; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 128(%esp,%esi), %edi
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, %ecx
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    shrl %cl, %edi
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 132(%esp,%esi), %ebx
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    leal (%ebx,%ebx), %ebp
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %edx, %ecx
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    shll %cl, %ebp
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    orl %edi, %ebp
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, %ecx
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %edx, (%esp) # 4-byte Folded Spill
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 76(%esp,%esi), %edi
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 136(%esp,%esi), %esi
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %edx, %esi
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 104(%esp,%eax), %esi
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 100(%esp,%eax), %edx
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %edx, %ebx
 ; X86-NO-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %esi, %ebx
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %ebx, 56(%edx)
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %ebx, %edi
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    shrl %cl, %esi
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %esi, 60(%edx)
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl (%esp), %eax # 4-byte Reload
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, 48(%edx)
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, 40(%edx)
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, 32(%edx)
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, 24(%edx)
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, 16(%edx)
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, 8(%edx)
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %edi, (%edx)
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %ebp, 52(%edx)
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, 44(%edx)
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, 36(%edx)
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, 28(%edx)
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, 20(%edx)
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, 12(%edx)
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, 4(%edx)
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    addl $204, %esp
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %edx, %edi
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 48(%esp,%eax), %ebp
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 108(%esp,%eax), %edx
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %edx, %esi
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %esi, 56(%eax)
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %esi, %ebp
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    # kill: def $cl killed $cl killed $ecx
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    shrl %cl, %edx
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %edx, 60(%eax)
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %edi, 48(%eax)
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %ebx, 52(%eax)
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %ecx, 40(%eax)
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %ecx, 44(%eax)
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl (%esp), %ecx # 4-byte Reload
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %ecx, 32(%eax)
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %ecx, 36(%eax)
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %ecx, 24(%eax)
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %ecx, 28(%eax)
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %ecx, 16(%eax)
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %ecx, 20(%eax)
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %ecx, 8(%eax)
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %ecx, 12(%eax)
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %ebp, (%eax)
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %ecx, 4(%eax)
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    addl $188, %esp
 ; X86-NO-BMI2-HAVE-SHLD-NEXT:    popl %esi
 ; X86-NO-BMI2-HAVE-SHLD-NEXT:    popl %edi
 ; X86-NO-BMI2-HAVE-SHLD-NEXT:    popl %ebx
@@ -4252,42 +3848,46 @@ define void @lshr_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    pushl %edi
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    pushl %esi
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    subl $204, %esp
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl (%edx), %eax
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 4(%edx), %eax
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 8(%edx), %eax
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 12(%edx), %eax
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 16(%edx), %eax
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 20(%edx), %eax
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 24(%edx), %eax
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 28(%edx), %eax
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 32(%edx), %eax
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 36(%edx), %eax
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, (%esp) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 40(%edx), %ebp
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 44(%edx), %ebx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 48(%edx), %edi
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 52(%edx), %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 56(%edx), %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 60(%edx), %eax
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl (%edx), %edx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl (%eax), %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 4(%eax), %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 8(%eax), %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 12(%eax), %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 16(%eax), %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 20(%eax), %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 24(%eax), %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 28(%eax), %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 32(%eax), %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 36(%eax), %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 40(%eax), %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 44(%eax), %ebp
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 48(%eax), %ebx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 52(%eax), %edi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 56(%eax), %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 60(%eax), %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl (%eax), %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    xorps %xmm0, %xmm0
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %edx, {{[0-9]+}}(%esp)
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %esi, {{[0-9]+}}(%esp)
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %edi, {{[0-9]+}}(%esp)
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ebx, {{[0-9]+}}(%esp)
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ebp, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl (%esp), %eax # 4-byte Reload
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[0-9]+}}(%esp)
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[0-9]+}}(%esp)
@@ -4297,6 +3897,7 @@ define void @lshr_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[0-9]+}}(%esp)
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[0-9]+}}(%esp)
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
@@ -4307,163 +3908,141 @@ define void @lshr_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[0-9]+}}(%esp)
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %edx, %eax
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    andl $7, %eax
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrl $3, %edx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    andl $63, %edx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 80(%esp,%edx), %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 84(%esp,%edx), %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %eax, %ecx, %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, %edi
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    notl %edi
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    andl $31, %edi
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    addl %esi, %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %edi, %esi, %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl %ecx, %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 92(%esp,%edx), %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, (%esp) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    addl %ecx, %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %edi, %ecx, %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 88(%esp,%edx), %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %eax, %ecx, %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl %ecx, %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 100(%esp,%edx), %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    addl %ecx, %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %edi, %ecx, %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 96(%esp,%edx), %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %eax, %ecx, %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl %ecx, %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    andl $31, %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrl $3, %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    andl $60, %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 68(%esp,%ecx), %edi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 72(%esp,%ecx), %esi
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 108(%esp,%edx), %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    addl %ecx, %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %edi, %ecx, %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 104(%esp,%edx), %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %eax, %ecx, %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl %ecx, %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %edx, %edi, %ebx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %edx, %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    notl %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    leal (%esi,%esi), %ebp
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %eax, %ebp, %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl %ebx, %esi
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 116(%esp,%edx), %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    addl %ecx, %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %edi, %ecx, %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 112(%esp,%edx), %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %eax, %ecx, %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl %ecx, %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %edx, %ebx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    xorb $31, %bl
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    addl %edi, %edi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %ebx, %edi, %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %edx, 64(%esp,%ecx), %edi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl %edi, %esi
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 124(%esp,%edx), %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    addl %ecx, %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %edi, %ecx, %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 120(%esp,%edx), %ebx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %eax, %ebx, %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl %ecx, %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 80(%esp,%ecx), %esi
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 132(%esp,%edx), %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    addl %ecx, %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %edi, %ecx, %edi
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 128(%esp,%edx), %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %eax, %esi, %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl %ecx, %edi
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    notb %cl
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    addl %edi, %edi
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %ecx, %edi, %ebp
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %eax, 76(%esp,%edx), %edi
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl %edi, %ebp
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %eax, {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    addl %ebp, %ebp
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %ecx, %ebp, %ebp
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl %edi, %ebp
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %eax, (%esp), %edi # 4-byte Folded Reload
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    addl %ebp, %ebp
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %ecx, %ebp, %ebp
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl %edi, %ebp
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ebp, (%esp) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %eax, {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    leal (%esi,%esi), %ebp
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, %edi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %eax, %ebp, %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 76(%esp,%ecx), %ebp
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %edx, %ebp, %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl %esi, %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    addl %ebp, %ebp
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %ecx, %ebp, %ebp
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl %edi, %ebp
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %eax, {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %ebx, %ebp, %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl %esi, %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 88(%esp,%ecx), %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    leal (%eax,%eax), %ebp
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %edi, %ebp, %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 84(%esp,%ecx), %ebp
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %edx, %ebp, %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl %esi, %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    addl %ebp, %ebp
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %ecx, %ebp, %ebp
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl %edi, %ebp
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %eax, {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    addl %ebx, %ebx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %ecx, %ebx, %ebx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl %edi, %ebx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %eax, {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %ebx, %ebp, %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl %esi, %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 96(%esp,%ecx), %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    leal (%eax,%eax), %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %edi, %esi, %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 92(%esp,%ecx), %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %edx, %esi, %ebp
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl %ebp, %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    addl %esi, %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %ecx, %esi, %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl %edi, %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %eax, {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %ebx, %esi, %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl %ebp, %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 104(%esp,%ecx), %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    leal (%eax,%eax), %ebp
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %edi, %ebp, %eax
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 136(%esp,%edx), %edx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %eax, %edx, %edi
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    leal (%edx,%edx), %eax
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %ecx, %eax, %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %edi, 60(%eax)
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, 56(%eax)
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %esi, 48(%eax)
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ebx, 40(%eax)
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ebp, 32(%eax)
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, 24(%eax)
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl (%esp), %ecx # 4-byte Reload
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, 16(%eax)
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, 8(%eax)
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, (%eax)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 100(%esp,%ecx), %ebp
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %edx, %ebp, %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl %esi, %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    addl %ebp, %ebp
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %ebx, %ebp, %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl %esi, %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 112(%esp,%ecx), %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    leal (%eax,%eax), %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %edi, %esi, %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 108(%esp,%ecx), %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %edx, %esi, %ebp
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl %ebp, %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    addl %esi, %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %ebx, %esi, %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl %ebp, %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 120(%esp,%ecx), %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    leal (%eax,%eax), %edi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %esi, %edi, %ebp
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 116(%esp,%ecx), %edi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %edx, %edi, %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl %esi, %ebp
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    addl %edi, %edi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %ebx, %edi, %edi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl %esi, %edi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %edx, %eax, %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 124(%esp,%ecx), %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %edx, %eax, %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    addl %eax, %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %ebx, %eax, %ebx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl %esi, %ebx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %edx, 60(%eax)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ebx, 56(%eax)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %edi, 48(%eax)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ebp, 52(%eax)
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, 52(%eax)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, 40(%eax)
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, 44(%eax)
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, 32(%eax)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, 36(%eax)
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, 24(%eax)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, 28(%eax)
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, 16(%eax)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, 20(%eax)
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, 8(%eax)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, 12(%eax)
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, (%eax)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, 4(%eax)
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    addl $204, %esp
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    popl %esi
@@ -4478,7 +4057,7 @@ define void @lshr_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    pushl %ebx
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    pushl %edi
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    pushl %esi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    subl $200, %esp
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    subl $188, %esp
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl (%eax), %ecx
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
@@ -4489,7 +4068,7 @@ define void @lshr_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 12(%eax), %ecx
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 16(%eax), %ecx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ecx, (%esp) # 4-byte Spill
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 20(%eax), %ecx
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 24(%eax), %ecx
@@ -4499,7 +4078,7 @@ define void @lshr_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 32(%eax), %ecx
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 36(%eax), %ecx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ecx, (%esp) # 4-byte Spill
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 40(%eax), %ebp
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 44(%eax), %ebx
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 48(%eax), %edi
@@ -4508,13 +4087,17 @@ define void @lshr_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 60(%eax), %ecx
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl (%eax), %eax
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    xorps %xmm0, %xmm0
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edx, {{[0-9]+}}(%esp)
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %esi, {{[0-9]+}}(%esp)
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edi, {{[0-9]+}}(%esp)
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ebx, {{[0-9]+}}(%esp)
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ebp, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl (%esp), %ecx # 4-byte Reload
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
@@ -4522,9 +4105,10 @@ define void @lshr_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl (%esp), %ecx # 4-byte Reload
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
@@ -4534,138 +4118,90 @@ define void @lshr_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, %edx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    andl $7, %edx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    shrl $3, %eax
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    andl $63, %eax
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 76(%esp,%eax), %ecx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    shrxl %edx, %ecx, %esi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edx, %ecx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    notl %ecx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, %ecx
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    andl $31, %ecx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 80(%esp,%eax), %edi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    addl %edi, %edi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    shlxl %ecx, %edi, %edi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    orl %esi, %edi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 88(%esp,%eax), %esi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    addl %esi, %esi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    shlxl %ecx, %esi, %edi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 84(%esp,%eax), %esi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    shrxl %edx, %esi, %esi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    orl %esi, %edi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 96(%esp,%eax), %esi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    addl %esi, %esi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    shlxl %ecx, %esi, %edi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 92(%esp,%eax), %esi
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    shrl $3, %eax
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    andl $60, %eax
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 56(%esp,%eax), %edi
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 52(%esp,%eax), %edx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %edi, %edx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 64(%esp,%eax), %ebx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 60(%esp,%eax), %edx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edx, %esi
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %ebx, %esi
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    shrxl %edx, %esi, %esi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    orl %esi, %edi
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %edx, %edi
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 104(%esp,%eax), %esi
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 72(%esp,%eax), %edi
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 68(%esp,%eax), %edx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edx, %esi
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %edi, %esi
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    addl %esi, %esi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    shlxl %ecx, %esi, %edi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 100(%esp,%eax), %esi
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %edx, %ebx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 80(%esp,%eax), %ebx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 76(%esp,%eax), %edx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edx, %esi
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %ebx, %esi
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    shrxl %edx, %esi, %esi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    orl %esi, %edi
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %edx, %edi
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 112(%esp,%eax), %esi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %esi, (%esp) # 4-byte Spill
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    addl %esi, %esi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    shlxl %ecx, %esi, %edi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 108(%esp,%eax), %esi
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 88(%esp,%eax), %ebp
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 84(%esp,%eax), %edx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edx, %esi
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %ebp, %esi
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    shrxl %edx, %esi, %esi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    orl %esi, %edi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 120(%esp,%eax), %ebp
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    leal (%ebp,%ebp), %esi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    shlxl %ecx, %esi, %edi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 116(%esp,%eax), %esi
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %edx, %ebx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ebx, (%esp) # 4-byte Spill
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 96(%esp,%eax), %edi
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 92(%esp,%eax), %edx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edx, %esi
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %edi, %esi
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    shrxl %edx, %esi, %esi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    orl %esi, %edi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 128(%esp,%eax), %ebx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    leal (%ebx,%ebx), %esi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    shlxl %ecx, %esi, %edi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 124(%esp,%eax), %esi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    shrxl %edx, %esi, %ecx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    orl %ecx, %edi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edx, %ecx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %edi, (%esp) # 4-byte Folded Spill
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %esi, %ebp
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 72(%esp,%eax), %esi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 132(%esp,%eax), %edi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %edi, %ebx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %edx, %ebp
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 104(%esp,%eax), %edx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 100(%esp,%eax), %ebx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ebx, %esi
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %edx, %esi
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %ebx, %edi
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 48(%esp,%eax), %ebx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 108(%esp,%eax), %eax
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %eax, %edx
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ebx, 56(%eax)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ebp, 48(%eax)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl (%esp), %ecx # 4-byte Reload
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ecx, 40(%eax)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ecx, 32(%eax)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ecx, 24(%eax)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ecx, 16(%eax)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ecx, 8(%eax)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    shrxl %edx, %edi, %edi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edx, %ecx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edx, 56(%eax)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edi, 48(%eax)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %esi, 52(%eax)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ebp, 40(%eax)
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %edx, %esi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %esi, (%eax)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edi, 60(%eax)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ecx, 52(%eax)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ecx, 44(%eax)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ecx, 36(%eax)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ecx, 28(%eax)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ecx, 20(%eax)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ecx, 12(%eax)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edx, 44(%eax)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl (%esp), %edx # 4-byte Reload
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edx, 32(%eax)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edx, 36(%eax)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edx, 24(%eax)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edx, 28(%eax)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edx, 16(%eax)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edx, 20(%eax)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edx, 8(%eax)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edx, 12(%eax)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    shrxl %ecx, {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    # kill: def $cl killed $cl killed $ecx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %esi, %ebx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ebx, (%eax)
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ecx, 4(%eax)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    addl $200, %esp
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edx, 60(%eax)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    addl $188, %esp
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    popl %esi
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    popl %edi
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    popl %ebx
@@ -4680,7 +4216,6 @@ define void @lshr_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 define void @shl_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; X64-NO-BMI2-NO-SHLD-LABEL: shl_64bytes:
 ; X64-NO-BMI2-NO-SHLD:       # %bb.0:
-; X64-NO-BMI2-NO-SHLD-NEXT:    pushq %rbp
 ; X64-NO-BMI2-NO-SHLD-NEXT:    pushq %r15
 ; X64-NO-BMI2-NO-SHLD-NEXT:    pushq %r14
 ; X64-NO-BMI2-NO-SHLD-NEXT:    pushq %r13
@@ -4695,6 +4230,11 @@ define void @shl_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; X64-NO-BMI2-NO-SHLD-NEXT:    movq 48(%rdi), %rbx
 ; X64-NO-BMI2-NO-SHLD-NEXT:    movq 56(%rdi), %rdi
 ; X64-NO-BMI2-NO-SHLD-NEXT:    movl (%rsi), %esi
+; X64-NO-BMI2-NO-SHLD-NEXT:    xorps %xmm0, %xmm0
+; X64-NO-BMI2-NO-SHLD-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-NO-SHLD-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-NO-SHLD-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-NO-SHLD-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
 ; X64-NO-BMI2-NO-SHLD-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
 ; X64-NO-BMI2-NO-SHLD-NEXT:    movq %rbx, -{{[0-9]+}}(%rsp)
 ; X64-NO-BMI2-NO-SHLD-NEXT:    movq %r11, -{{[0-9]+}}(%rsp)
@@ -4703,107 +4243,91 @@ define void @shl_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; X64-NO-BMI2-NO-SHLD-NEXT:    movq %r8, -{{[0-9]+}}(%rsp)
 ; X64-NO-BMI2-NO-SHLD-NEXT:    movq %rcx, -{{[0-9]+}}(%rsp)
 ; X64-NO-BMI2-NO-SHLD-NEXT:    movq %rax, -{{[0-9]+}}(%rsp)
-; X64-NO-BMI2-NO-SHLD-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
-; X64-NO-BMI2-NO-SHLD-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
-; X64-NO-BMI2-NO-SHLD-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
-; X64-NO-BMI2-NO-SHLD-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
-; X64-NO-BMI2-NO-SHLD-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
-; X64-NO-BMI2-NO-SHLD-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
-; X64-NO-BMI2-NO-SHLD-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
-; X64-NO-BMI2-NO-SHLD-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
 ; X64-NO-BMI2-NO-SHLD-NEXT:    movl %esi, %eax
-; X64-NO-BMI2-NO-SHLD-NEXT:    andl $7, %eax
+; X64-NO-BMI2-NO-SHLD-NEXT:    andl $63, %eax
 ; X64-NO-BMI2-NO-SHLD-NEXT:    shrl $3, %esi
-; X64-NO-BMI2-NO-SHLD-NEXT:    andl $63, %esi
+; X64-NO-BMI2-NO-SHLD-NEXT:    andl $56, %esi
 ; X64-NO-BMI2-NO-SHLD-NEXT:    negl %esi
-; X64-NO-BMI2-NO-SHLD-NEXT:    movslq %esi, %r14
-; X64-NO-BMI2-NO-SHLD-NEXT:    movq -64(%rsp,%r14), %r9
-; X64-NO-BMI2-NO-SHLD-NEXT:    movq -56(%rsp,%r14), %r8
-; X64-NO-BMI2-NO-SHLD-NEXT:    movq %r8, %rdi
+; X64-NO-BMI2-NO-SHLD-NEXT:    movslq %esi, %rbx
+; X64-NO-BMI2-NO-SHLD-NEXT:    movq -64(%rsp,%rbx), %r8
+; X64-NO-BMI2-NO-SHLD-NEXT:    movq -56(%rsp,%rbx), %rdi
+; X64-NO-BMI2-NO-SHLD-NEXT:    movq %rdi, %r10
 ; X64-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %ecx
-; X64-NO-BMI2-NO-SHLD-NEXT:    shlq %cl, %rdi
+; X64-NO-BMI2-NO-SHLD-NEXT:    shlq %cl, %r10
 ; X64-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %esi
-; X64-NO-BMI2-NO-SHLD-NEXT:    notb %sil
-; X64-NO-BMI2-NO-SHLD-NEXT:    movq %r9, %r10
-; X64-NO-BMI2-NO-SHLD-NEXT:    shrq %r10
+; X64-NO-BMI2-NO-SHLD-NEXT:    xorb $63, %sil
+; X64-NO-BMI2-NO-SHLD-NEXT:    movq %r8, %r9
+; X64-NO-BMI2-NO-SHLD-NEXT:    shrq %r9
 ; X64-NO-BMI2-NO-SHLD-NEXT:    movl %esi, %ecx
-; X64-NO-BMI2-NO-SHLD-NEXT:    shrq %cl, %r10
-; X64-NO-BMI2-NO-SHLD-NEXT:    orq %rdi, %r10
-; X64-NO-BMI2-NO-SHLD-NEXT:    movq -40(%rsp,%r14), %r11
-; X64-NO-BMI2-NO-SHLD-NEXT:    movq %r11, %rdi
+; X64-NO-BMI2-NO-SHLD-NEXT:    shrq %cl, %r9
+; X64-NO-BMI2-NO-SHLD-NEXT:    orq %r10, %r9
+; X64-NO-BMI2-NO-SHLD-NEXT:    movq -40(%rsp,%rbx), %r10
+; X64-NO-BMI2-NO-SHLD-NEXT:    movq %r10, %r14
 ; X64-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %ecx
-; X64-NO-BMI2-NO-SHLD-NEXT:    shlq %cl, %rdi
-; X64-NO-BMI2-NO-SHLD-NEXT:    movq -48(%rsp,%r14), %r15
-; X64-NO-BMI2-NO-SHLD-NEXT:    movq %r15, %rbx
-; X64-NO-BMI2-NO-SHLD-NEXT:    shrq %rbx
+; X64-NO-BMI2-NO-SHLD-NEXT:    shlq %cl, %r14
+; X64-NO-BMI2-NO-SHLD-NEXT:    movq -48(%rsp,%rbx), %r15
+; X64-NO-BMI2-NO-SHLD-NEXT:    movq %r15, %r11
+; X64-NO-BMI2-NO-SHLD-NEXT:    shrq %r11
 ; X64-NO-BMI2-NO-SHLD-NEXT:    movl %esi, %ecx
-; X64-NO-BMI2-NO-SHLD-NEXT:    shrq %cl, %rbx
-; X64-NO-BMI2-NO-SHLD-NEXT:    orq %rdi, %rbx
+; X64-NO-BMI2-NO-SHLD-NEXT:    shrq %cl, %r11
+; X64-NO-BMI2-NO-SHLD-NEXT:    orq %r14, %r11
 ; X64-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %ecx
 ; X64-NO-BMI2-NO-SHLD-NEXT:    shlq %cl, %r15
-; X64-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %edi
-; X64-NO-BMI2-NO-SHLD-NEXT:    notl %edi
-; X64-NO-BMI2-NO-SHLD-NEXT:    andl $63, %edi
-; X64-NO-BMI2-NO-SHLD-NEXT:    shrq %r8
-; X64-NO-BMI2-NO-SHLD-NEXT:    movl %edi, %ecx
-; X64-NO-BMI2-NO-SHLD-NEXT:    shrq %cl, %r8
-; X64-NO-BMI2-NO-SHLD-NEXT:    orq %r15, %r8
-; X64-NO-BMI2-NO-SHLD-NEXT:    movq -24(%rsp,%r14), %r15
-; X64-NO-BMI2-NO-SHLD-NEXT:    movq %r15, %r13
-; X64-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %ecx
-; X64-NO-BMI2-NO-SHLD-NEXT:    shlq %cl, %r13
-; X64-NO-BMI2-NO-SHLD-NEXT:    movq -32(%rsp,%r14), %rbp
-; X64-NO-BMI2-NO-SHLD-NEXT:    movq %rbp, %r12
-; X64-NO-BMI2-NO-SHLD-NEXT:    shrq %r12
+; X64-NO-BMI2-NO-SHLD-NEXT:    shrq %rdi
 ; X64-NO-BMI2-NO-SHLD-NEXT:    movl %esi, %ecx
-; X64-NO-BMI2-NO-SHLD-NEXT:    shrq %cl, %r12
-; X64-NO-BMI2-NO-SHLD-NEXT:    orq %r13, %r12
+; X64-NO-BMI2-NO-SHLD-NEXT:    shrq %cl, %rdi
+; X64-NO-BMI2-NO-SHLD-NEXT:    orq %r15, %rdi
+; X64-NO-BMI2-NO-SHLD-NEXT:    movq -24(%rsp,%rbx), %r14
+; X64-NO-BMI2-NO-SHLD-NEXT:    movq %r14, %r12
 ; X64-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %ecx
-; X64-NO-BMI2-NO-SHLD-NEXT:    shlq %cl, %rbp
-; X64-NO-BMI2-NO-SHLD-NEXT:    shrq %r11
-; X64-NO-BMI2-NO-SHLD-NEXT:    movl %edi, %ecx
-; X64-NO-BMI2-NO-SHLD-NEXT:    shrq %cl, %r11
-; X64-NO-BMI2-NO-SHLD-NEXT:    orq %rbp, %r11
-; X64-NO-BMI2-NO-SHLD-NEXT:    movq -8(%rsp,%r14), %r13
+; X64-NO-BMI2-NO-SHLD-NEXT:    shlq %cl, %r12
+; X64-NO-BMI2-NO-SHLD-NEXT:    movq -32(%rsp,%rbx), %r13
+; X64-NO-BMI2-NO-SHLD-NEXT:    movq %r13, %r15
+; X64-NO-BMI2-NO-SHLD-NEXT:    shrq %r15
+; X64-NO-BMI2-NO-SHLD-NEXT:    movl %esi, %ecx
+; X64-NO-BMI2-NO-SHLD-NEXT:    shrq %cl, %r15
+; X64-NO-BMI2-NO-SHLD-NEXT:    orq %r12, %r15
 ; X64-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %ecx
 ; X64-NO-BMI2-NO-SHLD-NEXT:    shlq %cl, %r13
-; X64-NO-BMI2-NO-SHLD-NEXT:    movq -16(%rsp,%r14), %r14
-; X64-NO-BMI2-NO-SHLD-NEXT:    movq %r14, %rbp
-; X64-NO-BMI2-NO-SHLD-NEXT:    shrq %rbp
+; X64-NO-BMI2-NO-SHLD-NEXT:    shrq %r10
 ; X64-NO-BMI2-NO-SHLD-NEXT:    movl %esi, %ecx
-; X64-NO-BMI2-NO-SHLD-NEXT:    shrq %cl, %rbp
-; X64-NO-BMI2-NO-SHLD-NEXT:    orq %r13, %rbp
+; X64-NO-BMI2-NO-SHLD-NEXT:    shrq %cl, %r10
+; X64-NO-BMI2-NO-SHLD-NEXT:    orq %r13, %r10
+; X64-NO-BMI2-NO-SHLD-NEXT:    movq -8(%rsp,%rbx), %r12
 ; X64-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %ecx
-; X64-NO-BMI2-NO-SHLD-NEXT:    shlq %cl, %r14
-; X64-NO-BMI2-NO-SHLD-NEXT:    shrq %r15
-; X64-NO-BMI2-NO-SHLD-NEXT:    movl %edi, %ecx
-; X64-NO-BMI2-NO-SHLD-NEXT:    shrq %cl, %r15
-; X64-NO-BMI2-NO-SHLD-NEXT:    orq %r14, %r15
+; X64-NO-BMI2-NO-SHLD-NEXT:    shlq %cl, %r12
+; X64-NO-BMI2-NO-SHLD-NEXT:    movq -16(%rsp,%rbx), %rbx
+; X64-NO-BMI2-NO-SHLD-NEXT:    movq %rbx, %r13
+; X64-NO-BMI2-NO-SHLD-NEXT:    shrq %r13
+; X64-NO-BMI2-NO-SHLD-NEXT:    movl %esi, %ecx
+; X64-NO-BMI2-NO-SHLD-NEXT:    shrq %cl, %r13
+; X64-NO-BMI2-NO-SHLD-NEXT:    orq %r12, %r13
 ; X64-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %ecx
-; X64-NO-BMI2-NO-SHLD-NEXT:    shlq %cl, %r9
-; X64-NO-BMI2-NO-SHLD-NEXT:    movq %r9, (%rdx)
-; X64-NO-BMI2-NO-SHLD-NEXT:    movq %r15, 48(%rdx)
-; X64-NO-BMI2-NO-SHLD-NEXT:    movq %rbp, 56(%rdx)
-; X64-NO-BMI2-NO-SHLD-NEXT:    movq %r11, 32(%rdx)
-; X64-NO-BMI2-NO-SHLD-NEXT:    movq %r12, 40(%rdx)
-; X64-NO-BMI2-NO-SHLD-NEXT:    movq %r8, 16(%rdx)
-; X64-NO-BMI2-NO-SHLD-NEXT:    movq %rbx, 24(%rdx)
-; X64-NO-BMI2-NO-SHLD-NEXT:    movq %r10, 8(%rdx)
+; X64-NO-BMI2-NO-SHLD-NEXT:    shlq %cl, %rbx
+; X64-NO-BMI2-NO-SHLD-NEXT:    shrq %r14
+; X64-NO-BMI2-NO-SHLD-NEXT:    movl %esi, %ecx
+; X64-NO-BMI2-NO-SHLD-NEXT:    shrq %cl, %r14
+; X64-NO-BMI2-NO-SHLD-NEXT:    orq %rbx, %r14
+; X64-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %ecx
+; X64-NO-BMI2-NO-SHLD-NEXT:    shlq %cl, %r8
+; X64-NO-BMI2-NO-SHLD-NEXT:    movq %r8, (%rdx)
+; X64-NO-BMI2-NO-SHLD-NEXT:    movq %r14, 48(%rdx)
+; X64-NO-BMI2-NO-SHLD-NEXT:    movq %r13, 56(%rdx)
+; X64-NO-BMI2-NO-SHLD-NEXT:    movq %r10, 32(%rdx)
+; X64-NO-BMI2-NO-SHLD-NEXT:    movq %r15, 40(%rdx)
+; X64-NO-BMI2-NO-SHLD-NEXT:    movq %rdi, 16(%rdx)
+; X64-NO-BMI2-NO-SHLD-NEXT:    movq %r11, 24(%rdx)
+; X64-NO-BMI2-NO-SHLD-NEXT:    movq %r9, 8(%rdx)
 ; X64-NO-BMI2-NO-SHLD-NEXT:    popq %rbx
 ; X64-NO-BMI2-NO-SHLD-NEXT:    popq %r12
 ; X64-NO-BMI2-NO-SHLD-NEXT:    popq %r13
 ; X64-NO-BMI2-NO-SHLD-NEXT:    popq %r14
 ; X64-NO-BMI2-NO-SHLD-NEXT:    popq %r15
-; X64-NO-BMI2-NO-SHLD-NEXT:    popq %rbp
 ; X64-NO-BMI2-NO-SHLD-NEXT:    retq
 ;
 ; X64-NO-BMI2-HAVE-SHLD-LABEL: shl_64bytes:
 ; X64-NO-BMI2-HAVE-SHLD:       # %bb.0:
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    pushq %rbp
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    pushq %r15
 ; X64-NO-BMI2-HAVE-SHLD-NEXT:    pushq %r14
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    pushq %r13
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    pushq %r12
 ; X64-NO-BMI2-HAVE-SHLD-NEXT:    pushq %rbx
 ; X64-NO-BMI2-HAVE-SHLD-NEXT:    pushq %rax
 ; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq (%rdi), %rax
@@ -4815,7 +4339,12 @@ define void @shl_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq 48(%rdi), %rbx
 ; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq 56(%rdi), %rdi
 ; X64-NO-BMI2-HAVE-SHLD-NEXT:    movl (%rsi), %esi
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %rdi, (%rsp)
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    xorps %xmm0, %xmm0
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
 ; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %rbx, -{{[0-9]+}}(%rsp)
 ; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %r11, -{{[0-9]+}}(%rsp)
 ; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %r10, -{{[0-9]+}}(%rsp)
@@ -4823,77 +4352,42 @@ define void @shl_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %r8, -{{[0-9]+}}(%rsp)
 ; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %rcx, -{{[0-9]+}}(%rsp)
 ; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %rax, -{{[0-9]+}}(%rsp)
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    movl %esi, %eax
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    andl $7, %eax
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movl %esi, %ecx
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    andl $63, %ecx
 ; X64-NO-BMI2-HAVE-SHLD-NEXT:    shrl $3, %esi
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    andl $63, %esi
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    andl $56, %esi
 ; X64-NO-BMI2-HAVE-SHLD-NEXT:    negl %esi
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    movslq %esi, %r10
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq -40(%rsp,%r10), %rbx
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %rbx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, %ecx
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    shlq %cl, %rbx
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, %esi
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    notl %esi
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    andl $63, %esi
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq -56(%rsp,%r10), %r9
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq -48(%rsp,%r10), %r8
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %r8, %rdi
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    shrq %rdi
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    movl %esi, %ecx
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    shrq %cl, %rdi
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    orq %rbx, %rdi
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq -24(%rsp,%r10), %r15
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %r15, %r12
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, %ecx
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    shlq %cl, %r12
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq -32(%rsp,%r10), %r14
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %r14, %rbx
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    shrq %rbx
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    movl %esi, %ecx
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    shrq %cl, %rbx
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    orq %r12, %rbx
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq -8(%rsp,%r10), %r12
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %r12, %r13
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, %ecx
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    shlq %cl, %r13
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq -16(%rsp,%r10), %r11
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %r11, %rbp
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    shrq %rbp
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    movl %esi, %ecx
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    shrq %cl, %rbp
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    orq %r13, %rbp
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, %ecx
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rsi # 8-byte Reload
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    shldq %cl, %rsi, %r14
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    shldq %cl, %r15, %r11
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq (%rsp,%r10), %rsi
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    shldq %cl, %r12, %rsi
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    shldq %cl, %r9, %r8
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    shlq %cl, %r9
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %rsi, 56(%rdx)
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %r11, 40(%rdx)
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %r14, 24(%rdx)
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %r9, (%rdx)
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %r8, 8(%rdx)
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %rbp, 48(%rdx)
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %rbx, 32(%rdx)
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %rdi, 16(%rdx)
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movslq %esi, %r9
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq -48(%rsp,%r9), %rax
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq -40(%rsp,%r9), %r10
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %r10, %rsi
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    shldq %cl, %rax, %rsi
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq -64(%rsp,%r9), %r8
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq -56(%rsp,%r9), %rdi
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    shldq %cl, %rdi, %rax
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq -32(%rsp,%r9), %r11
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq -24(%rsp,%r9), %rbx
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %rbx, %r14
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    shldq %cl, %r11, %r14
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    shldq %cl, %r10, %r11
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq -16(%rsp,%r9), %r10
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq -8(%rsp,%r9), %r9
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    shldq %cl, %r10, %r9
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    shldq %cl, %rbx, %r10
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    shldq %cl, %r8, %rdi
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    # kill: def $cl killed $cl killed $ecx
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    shlq %cl, %r8
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %r10, 48(%rdx)
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %r9, 56(%rdx)
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %r11, 32(%rdx)
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %r14, 40(%rdx)
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %rax, 16(%rdx)
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %rsi, 24(%rdx)
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %r8, (%rdx)
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %rdi, 8(%rdx)
 ; X64-NO-BMI2-HAVE-SHLD-NEXT:    addq $8, %rsp
 ; X64-NO-BMI2-HAVE-SHLD-NEXT:    popq %rbx
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    popq %r12
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    popq %r13
 ; X64-NO-BMI2-HAVE-SHLD-NEXT:    popq %r14
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    popq %r15
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    popq %rbp
 ; X64-NO-BMI2-HAVE-SHLD-NEXT:    retq
 ;
 ; X64-HAVE-BMI2-NO-SHLD-LABEL: shl_64bytes:
@@ -4904,6 +4398,7 @@ define void @shl_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    pushq %r13
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    pushq %r12
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    pushq %rbx
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    pushq %rax
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq (%rdi), %rax
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq 8(%rdi), %rcx
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq 16(%rdi), %r8
@@ -4913,6 +4408,11 @@ define void @shl_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq 48(%rdi), %rbx
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq 56(%rdi), %rdi
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    movl (%rsi), %esi
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    xorps %xmm0, %xmm0
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq %rbx, -{{[0-9]+}}(%rsp)
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq %r11, -{{[0-9]+}}(%rsp)
@@ -4922,68 +4422,58 @@ define void @shl_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq %rcx, -{{[0-9]+}}(%rsp)
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq %rax, -{{[0-9]+}}(%rsp)
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    movl %esi, %eax
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    andl $7, %eax
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    andl $63, %eax
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    shrl $3, %esi
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    andl $63, %esi
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    andl $56, %esi
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    negl %esi
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    movslq %esi, %rcx
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq -64(%rsp,%rcx), %r10
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq -56(%rsp,%rcx), %rsi
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    shlxq %rax, %rsi, %r9
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq -40(%rsp,%rcx), %rdi
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    shlxq %rax, %rdi, %rbx
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq -48(%rsp,%rcx), %r15
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    shlxq %rax, %r15, %r14
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, %r8d
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq -24(%rsp,%rcx), %r11
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    shlxq %rax, %r11, %r12
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    shlxq %rax, %r10, %r13
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, %ebp
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    notb %bpl
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movslq %esi, %rsi
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq -64(%rsp,%rsi), %r10
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq -56(%rsp,%rsi), %rcx
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    shlxq %rax, %rcx, %r9
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq -40(%rsp,%rsi), %rdi
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    shlxq %rax, %rdi, %r11
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq -48(%rsp,%rsi), %r14
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    shlxq %rax, %r14, %rbx
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq -24(%rsp,%rsi), %r8
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    shlxq %rax, %r8, %r15
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    shlxq %rax, %r10, %r12
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, %r13d
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    xorb $63, %r13b
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    shrq %r10
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    shrxq %rbp, %r10, %r10
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    shrxq %r13, %r10, %r10
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    orq %r9, %r10
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    shrq %r15
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    shrxq %rbp, %r15, %r9
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    orq %rbx, %r9
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq -32(%rsp,%rcx), %rbx
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    shlxq %rax, %rbx, %r15
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    notl %r8d
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    andl $63, %r8d
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    shrq %rsi
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    shrxq %r8, %rsi, %rsi
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    orq %r14, %rsi
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    shlxq %rax, -8(%rsp,%rcx), %r14
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq -16(%rsp,%rcx), %rcx
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    shlxq %rax, %rcx, %rax
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    shrq %rbx
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    shrxq %rbp, %rbx, %rbx
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    orq %r12, %rbx
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    shrq %rdi
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    shrxq %r8, %rdi, %rdi
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    orq %r15, %rdi
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq -32(%rsp,%rsi), %r9
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    shlxq %rax, %r9, %rbp
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    shrq %r14
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    shrxq %r13, %r14, %r14
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    orq %r11, %r14
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    shlxq %rax, -8(%rsp,%rsi), %r11
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq -16(%rsp,%rsi), %rsi
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    shlxq %rax, %rsi, %rax
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    shrq %rcx
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    shrxq %rbp, %rcx, %rcx
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    orq %r14, %rcx
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    shrq %r11
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    shrxq %r8, %r11, %r8
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    shrxq %r13, %rcx, %rcx
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    orq %rbx, %rcx
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    shrq %r9
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    shrxq %r13, %r9, %r9
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    orq %r15, %r9
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    shrq %rdi
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    shrxq %r13, %rdi, %rdi
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    orq %rbp, %rdi
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    shrq %rsi
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    shrxq %r13, %rsi, %rsi
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    orq %r11, %rsi
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    shrq %r8
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    shrxq %r13, %r8, %r8
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    orq %rax, %r8
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq %r13, (%rdx)
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq %r12, (%rdx)
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq %r8, 48(%rdx)
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq %rcx, 56(%rdx)
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq %rsi, 56(%rdx)
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq %rdi, 32(%rdx)
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq %rbx, 40(%rdx)
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq %rsi, 16(%rdx)
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq %r9, 24(%rdx)
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq %r9, 40(%rdx)
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq %rcx, 16(%rdx)
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq %r14, 24(%rdx)
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq %r10, 8(%rdx)
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    addq $8, %rsp
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    popq %rbx
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    popq %r12
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    popq %r13
@@ -4994,12 +4484,9 @@ define void @shl_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ;
 ; X64-HAVE-BMI2-HAVE-SHLD-LABEL: shl_64bytes:
 ; X64-HAVE-BMI2-HAVE-SHLD:       # %bb.0:
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    pushq %rbp
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    pushq %r15
 ; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    pushq %r14
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    pushq %r13
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    pushq %r12
 ; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    pushq %rbx
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    pushq %rax
 ; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq (%rdi), %rax
 ; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq 8(%rdi), %rcx
 ; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq 16(%rdi), %r8
@@ -5009,6 +4496,11 @@ define void @shl_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq 48(%rdi), %rbx
 ; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq 56(%rdi), %rdi
 ; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movl (%rsi), %esi
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    xorps %xmm0, %xmm0
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
 ; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
 ; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq %rbx, -{{[0-9]+}}(%rsp)
 ; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq %r11, -{{[0-9]+}}(%rsp)
@@ -5018,65 +4510,40 @@ define void @shl_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq %rcx, -{{[0-9]+}}(%rsp)
 ; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq %rax, -{{[0-9]+}}(%rsp)
 ; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %esi, %ecx
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    andl $7, %ecx
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    andl $63, %ecx
 ; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    shrl $3, %esi
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    andl $63, %esi
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    andl $56, %esi
 ; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    negl %esi
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movslq %esi, %rax
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq -48(%rsp,%rax), %rsi
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    shlxq %rcx, %rsi, %rbx
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq -32(%rsp,%rax), %rdi
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    shlxq %rcx, %rdi, %r12
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq -16(%rsp,%rax), %r10
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    shlxq %rcx, %r10, %r15
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ecx, %ebp
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    notl %ebp
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    andl $63, %ebp
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq -64(%rsp,%rax), %r11
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq -56(%rsp,%rax), %r9
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq %r9, %r8
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    shrq %r8
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    shrxq %rbp, %r8, %r8
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    orq %rbx, %r8
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq -40(%rsp,%rax), %r14
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq %r14, %rbx
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    shrq %rbx
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    shrxq %rbp, %rbx, %rbx
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    orq %r12, %rbx
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq -24(%rsp,%rax), %r13
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq %r13, %r12
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    shrq %r12
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    shrxq %rbp, %r12, %r12
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    shlxq %rcx, %r11, %rbp
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    orq %r15, %r12
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    shldq %cl, %rsi, %r14
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    shldq %cl, %rdi, %r13
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq -8(%rsp,%rax), %rax
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    shldq %cl, %r10, %rax
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    # kill: def $cl killed $cl killed $rcx
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    shldq %cl, %r11, %r9
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq %rax, 56(%rdx)
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq %r13, 40(%rdx)
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq %r14, 24(%rdx)
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq %rbp, (%rdx)
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq %r9, 8(%rdx)
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq %r12, 48(%rdx)
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq %rbx, 32(%rdx)
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq %r8, 16(%rdx)
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movslq %esi, %r8
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq -48(%rsp,%r8), %rax
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq -40(%rsp,%r8), %r9
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq %r9, %rsi
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    shldq %cl, %rax, %rsi
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq -64(%rsp,%r8), %r10
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq -56(%rsp,%r8), %rdi
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    shldq %cl, %rdi, %rax
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq -32(%rsp,%r8), %r11
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq -24(%rsp,%r8), %rbx
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq %rbx, %r14
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    shldq %cl, %r11, %r14
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    shldq %cl, %r9, %r11
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq -16(%rsp,%r8), %r9
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq -8(%rsp,%r8), %r8
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    shldq %cl, %r9, %r8
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    shldq %cl, %rbx, %r9
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    shldq %cl, %r10, %rdi
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    shlxq %rcx, %r10, %rcx
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq %r9, 48(%rdx)
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq %r8, 56(%rdx)
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq %r11, 32(%rdx)
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq %r14, 40(%rdx)
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq %rax, 16(%rdx)
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq %rsi, 24(%rdx)
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq %rcx, (%rdx)
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq %rdi, 8(%rdx)
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    addq $8, %rsp
 ; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    popq %rbx
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    popq %r12
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    popq %r13
 ; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    popq %r14
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    popq %r15
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    popq %rbp
 ; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    retq
 ;
 ; X86-NO-BMI2-NO-SHLD-LABEL: shl_64bytes:
@@ -5085,42 +4552,44 @@ define void @shl_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; X86-NO-BMI2-NO-SHLD-NEXT:    pushl %ebx
 ; X86-NO-BMI2-NO-SHLD-NEXT:    pushl %edi
 ; X86-NO-BMI2-NO-SHLD-NEXT:    pushl %esi
-; X86-NO-BMI2-NO-SHLD-NEXT:    subl $192, %esp
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ebx
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl (%ebx), %eax
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl 4(%ebx), %eax
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, (%esp) # 4-byte Spill
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl 8(%ebx), %eax
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl 12(%ebx), %eax
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl 16(%ebx), %eax
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl 20(%ebx), %eax
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl 24(%ebx), %eax
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl 28(%ebx), %eax
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl 32(%ebx), %eax
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl 36(%ebx), %eax
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl 40(%ebx), %ebp
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl 44(%ebx), %edi
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl 48(%ebx), %esi
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl 52(%ebx), %edx
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl 56(%ebx), %ecx
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl 60(%ebx), %eax
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ebx
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl (%ebx), %ebx
+; X86-NO-BMI2-NO-SHLD-NEXT:    subl $204, %esp
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl (%eax), %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl 4(%eax), %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl 8(%eax), %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl 12(%eax), %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl 16(%eax), %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl 20(%eax), %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl 24(%eax), %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl 28(%eax), %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl 32(%eax), %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl 36(%eax), %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl 40(%eax), %ebx
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl 44(%eax), %edi
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl 48(%eax), %esi
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl 52(%eax), %edx
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl 56(%eax), %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl 60(%eax), %eax
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ebp
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl (%ebp), %ebp
+; X86-NO-BMI2-NO-SHLD-NEXT:    xorps %xmm0, %xmm0
+; X86-NO-BMI2-NO-SHLD-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, {{[0-9]+}}(%esp)
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl %edx, {{[0-9]+}}(%esp)
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl %esi, {{[0-9]+}}(%esp)
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl %edi, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ebp, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ebx, {{[0-9]+}}(%esp)
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, {{[0-9]+}}(%esp)
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
@@ -5129,6 +4598,9 @@ define void @shl_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, {{[0-9]+}}(%esp)
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-NO-SHLD-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-NO-SHLD-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-NO-SHLD-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, {{[0-9]+}}(%esp)
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
@@ -5137,200 +4609,179 @@ define void @shl_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, {{[0-9]+}}(%esp)
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl (%esp), %eax # 4-byte Reload
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, {{[0-9]+}}(%esp)
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ebx, %eax
-; X86-NO-BMI2-NO-SHLD-NEXT:    shrl $3, %eax
-; X86-NO-BMI2-NO-SHLD-NEXT:    andl $63, %eax
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ebp, %edx
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ebp, %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT:    shrl $3, %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT:    andl $60, %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-BMI2-NO-SHLD-NEXT:    leal {{[0-9]+}}(%esp), %eax
+; X86-NO-BMI2-NO-SHLD-NEXT:    subl %ecx, %eax
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-NO-SHLD-NEXT:    leal {{[0-9]+}}(%esp), %ebp
-; X86-NO-BMI2-NO-SHLD-NEXT:    subl %eax, %ebp
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl (%ebp), %edx
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl 4(%ebp), %edi
-; X86-NO-BMI2-NO-SHLD-NEXT:    andl $7, %ebx
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl (%eax), %esi
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl 4(%eax), %edi
+; X86-NO-BMI2-NO-SHLD-NEXT:    andl $31, %edx
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %edi, %eax
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %edx, %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %eax
+; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %esi
+; X86-NO-BMI2-NO-SHLD-NEXT:    movb %dl, %ch
+; X86-NO-BMI2-NO-SHLD-NEXT:    xorb $31, %ch
+; X86-NO-BMI2-NO-SHLD-NEXT:    movb %ch, %cl
+; X86-NO-BMI2-NO-SHLD-NEXT:    movb %ch, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill
+; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %esi
+; X86-NO-BMI2-NO-SHLD-NEXT:    orl %eax, %esi
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl 12(%ebx), %eax
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-BMI2-NO-SHLD-NEXT:    movb %dl, %cl
+; X86-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %eax
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl 8(%ebx), %esi
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ebx, %ebp
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %esi, %ebx
+; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %ebx
+; X86-NO-BMI2-NO-SHLD-NEXT:    movb %ch, %cl
+; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %ebx
+; X86-NO-BMI2-NO-SHLD-NEXT:    orl %eax, %ebx
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %edx, %ebx
+; X86-NO-BMI2-NO-SHLD-NEXT:    movb %bl, %cl
+; X86-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %esi
+; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %edi
+; X86-NO-BMI2-NO-SHLD-NEXT:    movb %ch, %cl
+; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %edi
+; X86-NO-BMI2-NO-SHLD-NEXT:    orl %esi, %edi
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl 20(%ebp), %eax
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-BMI2-NO-SHLD-NEXT:    movb %bl, %cl
+; X86-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %eax
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl 16(%ebp), %esi
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %esi, %ebp
+; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %ebp
+; X86-NO-BMI2-NO-SHLD-NEXT:    movb %ch, %cl
+; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %ebp
+; X86-NO-BMI2-NO-SHLD-NEXT:    orl %eax, %ebp
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-BMI2-NO-SHLD-NEXT:    movb %bl, %cl
+; X86-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %esi
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %edi
+; X86-NO-BMI2-NO-SHLD-NEXT:    movb %ch, %cl
+; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %edi
+; X86-NO-BMI2-NO-SHLD-NEXT:    orl %esi, %edi
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl 28(%ebp), %edi
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl %edi, %eax
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ebx, %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT:    movb %bl, %cl
 ; X86-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %eax
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl 24(%ebp), %esi
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %esi, %edx
 ; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %edx
-; X86-NO-BMI2-NO-SHLD-NEXT:    movb %bl, %ch
-; X86-NO-BMI2-NO-SHLD-NEXT:    notb %ch
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movb %ch, %cl
-; X86-NO-BMI2-NO-SHLD-NEXT:    movb %ch, (%esp) # 1-byte Spill
 ; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %edx
 ; X86-NO-BMI2-NO-SHLD-NEXT:    orl %eax, %edx
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl 12(%ebp), %edx
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-NO-SHLD-NEXT:    movb %bl, %cl
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ebx, %esi
-; X86-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %edx
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl 8(%ebp), %eax
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %ebx
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ebx, %edx
+; X86-NO-BMI2-NO-SHLD-NEXT:    movb %dl, %cl
+; X86-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %esi
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
 ; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %ebx
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movb %ch, %cl
 ; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %ebx
-; X86-NO-BMI2-NO-SHLD-NEXT:    orl %edx, %ebx
+; X86-NO-BMI2-NO-SHLD-NEXT:    orl %esi, %ebx
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %esi, %ebx
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ebx, %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl 36(%ebp), %eax
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-BMI2-NO-SHLD-NEXT:    movb %dl, %cl
 ; X86-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %eax
-; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %edi
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %esi, %edx
-; X86-NO-BMI2-NO-SHLD-NEXT:    notl %edx
-; X86-NO-BMI2-NO-SHLD-NEXT:    andl $31, %edx
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %edx, %ecx
-; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %edi
-; X86-NO-BMI2-NO-SHLD-NEXT:    orl %eax, %edi
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl 20(%ebp), %edx
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ebx, %ecx
-; X86-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %edx
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl 16(%ebp), %eax
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %ebx
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl 32(%ebp), %esi
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %esi, %ebx
 ; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %ebx
-; X86-NO-BMI2-NO-SHLD-NEXT:    movzbl (%esp), %ecx # 1-byte Folded Reload
+; X86-NO-BMI2-NO-SHLD-NEXT:    movb %ch, %cl
 ; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %ebx
-; X86-NO-BMI2-NO-SHLD-NEXT:    orl %edx, %ebx
+; X86-NO-BMI2-NO-SHLD-NEXT:    orl %eax, %ebx
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %esi, %ebx
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ebx, %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT:    movb %dl, %cl
+; X86-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %esi
+; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %edi
+; X86-NO-BMI2-NO-SHLD-NEXT:    movb %ch, %cl
+; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %edi
+; X86-NO-BMI2-NO-SHLD-NEXT:    orl %esi, %edi
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl 44(%ebp), %esi
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %esi, %eax
+; X86-NO-BMI2-NO-SHLD-NEXT:    movb %dl, %cl
 ; X86-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %eax
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl 40(%ebp), %ebx
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ebx, %edi
 ; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %edi
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NO-BMI2-NO-SHLD-NEXT:    # kill: def $cl killed $cl killed $ecx
+; X86-NO-BMI2-NO-SHLD-NEXT:    movb %ch, %cl
 ; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %edi
 ; X86-NO-BMI2-NO-SHLD-NEXT:    orl %eax, %edi
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl 28(%ebp), %edi
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %edi, %esi
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ebx, %ecx
-; X86-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %esi
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl 24(%ebp), %edx
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %edx, %eax
-; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %eax
-; X86-NO-BMI2-NO-SHLD-NEXT:    movzbl (%esp), %ecx # 1-byte Folded Reload
-; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %eax
-; X86-NO-BMI2-NO-SHLD-NEXT:    orl %esi, %eax
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ebx, %ecx
-; X86-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %edx
+; X86-NO-BMI2-NO-SHLD-NEXT:    movb %dl, %cl
+; X86-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %ebx
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
 ; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %eax
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NO-BMI2-NO-SHLD-NEXT:    # kill: def $cl killed $cl killed $ecx
+; X86-NO-BMI2-NO-SHLD-NEXT:    movb %ch, %cl
 ; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %eax
-; X86-NO-BMI2-NO-SHLD-NEXT:    orl %edx, %eax
+; X86-NO-BMI2-NO-SHLD-NEXT:    orl %ebx, %eax
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl 36(%ebp), %edx
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ebx, %ecx
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ebx, %esi
-; X86-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %edx
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl 32(%ebp), %eax
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %ebx
-; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %ebx
-; X86-NO-BMI2-NO-SHLD-NEXT:    movzbl (%esp), %ecx # 1-byte Folded Reload
-; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %ebx
-; X86-NO-BMI2-NO-SHLD-NEXT:    orl %edx, %ebx
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %esi, %ebx
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ebx, %ecx
-; X86-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %eax
-; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %edi
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NO-BMI2-NO-SHLD-NEXT:    # kill: def $cl killed $cl killed $ecx
-; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %edi
-; X86-NO-BMI2-NO-SHLD-NEXT:    orl %eax, %edi
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl 44(%ebp), %edx
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl 52(%eax), %ebp
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ebp, %edi
+; X86-NO-BMI2-NO-SHLD-NEXT:    movb %dl, %cl
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ebx, %ecx
-; X86-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %edx
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl 40(%ebp), %eax
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %edi
-; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %edi
-; X86-NO-BMI2-NO-SHLD-NEXT:    movzbl (%esp), %ecx # 1-byte Folded Reload
-; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %edi
-; X86-NO-BMI2-NO-SHLD-NEXT:    orl %edx, %edi
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ebx, %ecx
-; X86-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %eax
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %esi
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NO-BMI2-NO-SHLD-NEXT:    # kill: def $cl killed $cl killed $ecx
-; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %esi
-; X86-NO-BMI2-NO-SHLD-NEXT:    orl %eax, %esi
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl 52(%ebp), %eax
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %edi
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ebx, %ecx
 ; X86-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %edi
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NO-BMI2-NO-SHLD-NEXT:    negl %ecx
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl 176(%esp,%ecx), %edx
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %edx, %esi
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NO-BMI2-NO-SHLD-NEXT:    negl %eax
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl 176(%esp,%eax), %ebx
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ebx, %eax
+; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %eax
+; X86-NO-BMI2-NO-SHLD-NEXT:    movb %ch, %cl
+; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %eax
+; X86-NO-BMI2-NO-SHLD-NEXT:    orl %edi, %eax
+; X86-NO-BMI2-NO-SHLD-NEXT:    movb %dl, %cl
+; X86-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %ebx
 ; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %esi
-; X86-NO-BMI2-NO-SHLD-NEXT:    movzbl (%esp), %ecx # 1-byte Folded Reload
+; X86-NO-BMI2-NO-SHLD-NEXT:    movb %ch, %cl
 ; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %esi
-; X86-NO-BMI2-NO-SHLD-NEXT:    orl %edi, %esi
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ebx, %ecx
-; X86-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %edx
+; X86-NO-BMI2-NO-SHLD-NEXT:    orl %ebx, %esi
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %edi
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl 60(%edi), %edx
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
 ; X86-NO-BMI2-NO-SHLD-NEXT:    # kill: def $cl killed $cl killed $ecx
+; X86-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %edx
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl 56(%edi), %ebx
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ebx, %edi
+; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %edi
+; X86-NO-BMI2-NO-SHLD-NEXT:    movb {{[-0-9]+}}(%e{{[sb]}}p), %ch # 1-byte Reload
+; X86-NO-BMI2-NO-SHLD-NEXT:    movb %ch, %cl
 ; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %edi
 ; X86-NO-BMI2-NO-SHLD-NEXT:    orl %edx, %edi
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl 60(%ebp), %edi
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ebx, %ecx
-; X86-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %edi
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl 56(%ebp), %edx
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %edx, %ebp
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NO-BMI2-NO-SHLD-NEXT:    movb %dl, %cl
+; X86-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %ebx
 ; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %ebp
-; X86-NO-BMI2-NO-SHLD-NEXT:    movzbl (%esp), %ecx # 1-byte Folded Reload
+; X86-NO-BMI2-NO-SHLD-NEXT:    movb %ch, %cl
 ; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %ebp
-; X86-NO-BMI2-NO-SHLD-NEXT:    orl %edi, %ebp
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ebx, %ecx
-; X86-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %edx
-; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %eax
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NO-BMI2-NO-SHLD-NEXT:    # kill: def $cl killed $cl killed $ecx
-; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %eax
-; X86-NO-BMI2-NO-SHLD-NEXT:    orl %edx, %eax
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ebx, %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT:    orl %ebx, %ebp
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %edx, %ecx
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
 ; X86-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %edx
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl %edx, (%ecx)
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, 56(%ecx)
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ebp, 60(%ecx)
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, 48(%ecx)
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %esi, 52(%ecx)
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ebp, 56(%ecx)
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %edi, 60(%ecx)
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %esi, 48(%ecx)
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, 52(%ecx)
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, 40(%ecx)
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
@@ -5353,7 +4804,7 @@ define void @shl_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, 12(%ecx)
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, 4(%ecx)
-; X86-NO-BMI2-NO-SHLD-NEXT:    addl $192, %esp
+; X86-NO-BMI2-NO-SHLD-NEXT:    addl $204, %esp
 ; X86-NO-BMI2-NO-SHLD-NEXT:    popl %esi
 ; X86-NO-BMI2-NO-SHLD-NEXT:    popl %edi
 ; X86-NO-BMI2-NO-SHLD-NEXT:    popl %ebx
@@ -5366,213 +4817,153 @@ define void @shl_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; X86-NO-BMI2-HAVE-SHLD-NEXT:    pushl %ebx
 ; X86-NO-BMI2-HAVE-SHLD-NEXT:    pushl %edi
 ; X86-NO-BMI2-HAVE-SHLD-NEXT:    pushl %esi
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    subl $204, %esp
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl (%eax), %ecx
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 4(%eax), %ecx
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 8(%eax), %ecx
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 12(%eax), %ecx
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 16(%eax), %ecx
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %ecx, (%esp) # 4-byte Spill
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 20(%eax), %ecx
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 24(%eax), %ecx
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 28(%eax), %ecx
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 32(%eax), %ecx
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 36(%eax), %ecx
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 40(%eax), %ebp
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 44(%eax), %ebx
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 48(%eax), %edi
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 52(%eax), %esi
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 56(%eax), %edx
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 60(%eax), %ecx
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl (%eax), %eax
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    subl $188, %esp
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl (%ecx), %eax
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 4(%ecx), %eax
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 8(%ecx), %eax
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 12(%ecx), %eax
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 16(%ecx), %eax
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 20(%ecx), %eax
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 24(%ecx), %eax
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 28(%ecx), %eax
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 32(%ecx), %eax
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 36(%ecx), %eax
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, (%esp) # 4-byte Spill
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 40(%ecx), %ebp
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 44(%ecx), %ebx
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 48(%ecx), %edi
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 52(%ecx), %esi
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 56(%ecx), %edx
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 60(%ecx), %eax
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl (%ecx), %ecx
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    xorps %xmm0, %xmm0
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[0-9]+}}(%esp)
 ; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %edx, {{[0-9]+}}(%esp)
 ; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %esi, {{[0-9]+}}(%esp)
 ; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %edi, {{[0-9]+}}(%esp)
 ; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %ebx, {{[0-9]+}}(%esp)
 ; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %ebp, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl (%esp), %ecx # 4-byte Reload
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, %esi
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    shrl $3, %esi
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    andl $63, %esi
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    leal {{[0-9]+}}(%esp), %edi
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    subl %esi, %edi
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 8(%edi), %ebx
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    andl $7, %eax
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, %ecx
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    shll %cl, %ebx
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 4(%edi), %ebp
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    shrl %ebp
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, %edx
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    notl %edx
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    andl $31, %edx
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %edx, %ecx
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    shrl %cl, %ebp
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    orl %ebx, %ebp
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 16(%edi), %ebx
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, %ecx
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    shll %cl, %ebx
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 12(%edi), %ebp
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    shrl %ebp
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %edx, %ecx
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    shrl %cl, %ebp
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    orl %ebx, %ebp
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 24(%edi), %ebx
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, %ecx
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    shll %cl, %ebx
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 20(%edi), %ebp
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    shrl %ebp
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %edx, %ecx
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    shrl %cl, %ebp
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    orl %ebx, %ebp
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 32(%edi), %ebx
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, %ecx
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    shll %cl, %ebx
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 28(%edi), %ebp
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    shrl %ebp
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %edx, %ecx
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    shrl %cl, %ebp
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    orl %ebx, %ebp
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 40(%edi), %ebx
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl (%esp), %eax # 4-byte Reload
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %ecx, %ebp
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    shrl $3, %ebp
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    andl $60, %ebp
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    leal {{[0-9]+}}(%esp), %eax
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    subl %ebp, %eax
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 8(%eax), %esi
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 12(%eax), %edx
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    andl $31, %ecx
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %edx, %edi
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    shldl %cl, %esi, %edi
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 4(%eax), %edi
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    shldl %cl, %edi, %esi
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 16(%eax), %edi
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 20(%eax), %esi
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %esi, %ebx
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    shldl %cl, %edi, %ebx
 ; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, %ecx
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    shll %cl, %ebx
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 36(%edi), %ebp
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    shrl %ebp
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %edx, %ecx
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    shrl %cl, %ebp
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    orl %ebx, %ebp
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    negl %esi
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 188(%esp,%esi), %ebx
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    shldl %cl, %edx, %edi
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 24(%eax), %edi
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 28(%eax), %edx
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %edx, %ebx
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    shldl %cl, %edi, %ebx
 ; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, %ecx
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    shll %cl, %ebx
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 44(%edi), %esi
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %esi, (%esp) # 4-byte Spill
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    shrl %esi
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %edx, %ecx
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    shrl %cl, %esi
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    orl %ebx, %esi
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 56(%edi), %esi
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    shldl %cl, %esi, %edi
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 32(%eax), %edi
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 36(%eax), %esi
 ; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %esi, %ebx
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, %ecx
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    shll %cl, %ebx
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 52(%edi), %ebp
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    shrl %ebp
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %edx, %ecx
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    shrl %cl, %ebp
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    orl %ebx, %ebp
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, %ecx
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    shldl %cl, %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    shldl %cl, %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    shldl %cl, %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    shldl %cl, %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    shldl %cl, %edi, %ebx
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    shldl %cl, %edx, %edi
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 40(%eax), %edx
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 44(%eax), %edi
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    shldl %cl, %edx, %edi
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %edi, (%esp) # 4-byte Spill
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    shldl %cl, %esi, %edx
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 56(%eax), %edx
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 60(%eax), %edi
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    shldl %cl, %edx, %edi
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl (%eax), %ebx
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 52(%eax), %esi
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    shldl %cl, %esi, %edx
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    negl %ebp
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 160(%esp,%ebp), %eax
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ebp
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %edx, 56(%ebp)
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %edi, 60(%ebp)
 ; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    shldl %cl, %edx, (%esp) # 4-byte Folded Spill
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl (%edi), %ebx
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 60(%edi), %edi
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    shldl %cl, %esi, %edi
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %edi, 60(%edx)
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    shldl %cl, %ebx, %edi
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    shldl %cl, %ebx, %edx
 ; X86-NO-BMI2-HAVE-SHLD-NEXT:    shll %cl, %ebx
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
 ; X86-NO-BMI2-HAVE-SHLD-NEXT:    shldl %cl, %eax, %esi
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %esi, 52(%edx)
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl (%esp), %eax # 4-byte Reload
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, 44(%edx)
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, 36(%edx)
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    # kill: def $cl killed $cl killed $ecx
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    shldl %cl, %edi, %eax
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, 48(%ebp)
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %esi, 52(%ebp)
 ; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, 28(%edx)
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, 40(%ebp)
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl (%esp), %eax # 4-byte Reload
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, 44(%ebp)
 ; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, 20(%edx)
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, 32(%ebp)
 ; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, 12(%edx)
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %ebx, (%edx)
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %edi, 4(%edx)
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %ebp, 56(%edx)
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, 36(%ebp)
 ; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, 48(%edx)
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, 24(%ebp)
 ; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, 40(%edx)
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, 28(%ebp)
 ; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, 32(%edx)
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, 16(%ebp)
 ; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, 24(%edx)
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, 20(%ebp)
 ; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, 16(%edx)
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, 8(%ebp)
 ; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, 8(%edx)
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    addl $204, %esp
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, 12(%ebp)
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %ebx, (%ebp)
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %edx, 4(%ebp)
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    addl $188, %esp
 ; X86-NO-BMI2-HAVE-SHLD-NEXT:    popl %esi
 ; X86-NO-BMI2-HAVE-SHLD-NEXT:    popl %edi
 ; X86-NO-BMI2-HAVE-SHLD-NEXT:    popl %ebx
@@ -5585,50 +4976,55 @@ define void @shl_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    pushl %ebx
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    pushl %edi
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    pushl %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    subl $216, %esp
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl (%edx), %eax
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 4(%edx), %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    subl $204, %esp
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ebp
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl (%ebp), %eax
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 8(%edx), %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 4(%ebp), %eax
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 12(%edx), %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 8(%ebp), %eax
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 16(%edx), %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 12(%ebp), %eax
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 20(%edx), %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 16(%ebp), %eax
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 24(%edx), %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 20(%ebp), %eax
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 28(%edx), %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 24(%ebp), %eax
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 32(%edx), %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 28(%ebp), %eax
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 36(%edx), %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 32(%ebp), %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, (%esp) # 4-byte Spill
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 36(%ebp), %eax
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 40(%edx), %ebp
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 44(%edx), %ebx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 48(%edx), %edi
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 52(%edx), %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 56(%edx), %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 60(%edx), %eax
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl (%edx), %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 40(%ebp), %ebx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 44(%ebp), %edi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 48(%ebp), %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 52(%ebp), %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 56(%ebp), %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 60(%ebp), %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ebp
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl (%ebp), %ebp
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    xorps %xmm0, %xmm0
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[0-9]+}}(%esp)
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %edx, {{[0-9]+}}(%esp)
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %esi, {{[0-9]+}}(%esp)
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %edi, {{[0-9]+}}(%esp)
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ebx, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ebp, {{[0-9]+}}(%esp)
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl (%esp), %eax # 4-byte Reload
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[0-9]+}}(%esp)
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[0-9]+}}(%esp)
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[0-9]+}}(%esp)
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
@@ -5641,179 +5037,150 @@ define void @shl_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[0-9]+}}(%esp)
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %edx, %eax
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    andl $7, %eax
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrl $3, %edx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    andl $63, %edx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ebp, %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    andl $31, %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrl $3, %ebp
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    andl $60, %ebp
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    leal {{[0-9]+}}(%esp), %edi
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    subl %edx, %edi
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 8(%edi), %ebp
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    notl %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    andl $31, %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 4(%edi), %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %esi, (%esp) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrl %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %ecx, %esi, %ebx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %eax, %ebp, %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl %esi, %ebx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 12(%edi), %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    subl %ebp, %edi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl (%edi), %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 4(%edi), %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %edx, %ebx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    xorb $31, %bl
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrl %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %ebx, %ecx, %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %edx, %eax, %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl %ecx, %esi
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrl %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %ecx, %esi, %ebx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 16(%edi), %edx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %eax, %edx, %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl %esi, %ebx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 8(%edi), %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %esi, %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrl %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %ebx, %ecx, %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 12(%edi), %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %edx, %ecx, %ebp
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl %ebp, %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %edx, %esi, %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrl %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %ebx, %eax, %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl %esi, %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 16(%edi), %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrl %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %ebx, %eax, %eax
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 20(%edi), %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrl %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %ecx, %esi, %ebx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 24(%edi), %edx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %eax, %edx, %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl %esi, %ebx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 28(%edi), %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrl %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %ecx, %esi, %ebx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 32(%edi), %edx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %eax, %edx, %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl %esi, %ebx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 36(%edi), %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrl %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %ecx, %esi, %edx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 40(%edi), %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %eax, %esi, %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl %esi, %edx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 44(%edi), %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrl %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %ecx, %esi, %edx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 48(%edi), %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %eax, %esi, %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl %esi, %edx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 52(%edi), %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %esi, %ebx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrl %ebx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %ecx, %ebx, %edx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 56(%edi), %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %edx, %esi, %ebp
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl %ebp, %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrl %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %ebx, %ecx, %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl %eax, %ecx
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %eax, %ecx, %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl %ecx, %edx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %eax, (%esp), %ebx # 4-byte Folded Reload
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %eax, {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 24(%edi), %ecx
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, (%esp) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %eax, {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %eax, {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %eax, {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %eax, {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %eax, %esi, %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrl %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %ebx, %ecx, %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 28(%edi), %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %edx, %ecx, %ebp
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl %ebp, %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %edx, (%esp), %eax # 4-byte Folded Reload
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrl %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %ebx, %esi, %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl %eax, %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %esi, (%esp) # 4-byte Spill
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 32(%edi), %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrl %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %ebx, %eax, %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 36(%edi), %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %edx, %esi, %ebp
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl %ebp, %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrl %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %ebx, %ecx, %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl %eax, %ecx
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    negl %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %eax, 212(%esp,%ecx), %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 40(%edi), %ecx
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl (%edi), %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, %edx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %eax, %ecx, %eax
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    notb %dl
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrl %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %edx, %ecx, %eax
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl %ebx, %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %ebx, %ecx, %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 44(%edi), %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %edx, %ecx, %ebp
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl %ebp, %eax
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrl %ebp
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %edx, %ebp, %eax
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl (%esp), %eax # 4-byte Folded Reload
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, (%esp) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrl %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %ebx, %esi, %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl %eax, %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 48(%edi), %ebp
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ebp, %eax
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrl %eax
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %edx, %eax, %eax
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %ebx, %eax, %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 52(%edi), %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %edx, %ecx, %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl %esi, %eax
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %edx, %ebp, %ebp
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrl %eax
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %edx, %eax, %edi
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrl %eax
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %edx, %eax, %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrl %eax
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %edx, %eax, %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %ebx, %eax, %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl %ebp, %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    negl %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %edx, 188(%esp,%ecx), %ebp
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 56(%edi), %edi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %edx, %edi, %edx
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrl %eax
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %edx, %eax, %eax
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrl %ebp
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %edx, %ebp, %edx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl %edx, %ebp
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ebx, (%edx)
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ebp, 60(%edx)
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, 52(%edx)
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, 44(%edx)
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %esi, 36(%edx)
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %edi, 28(%edx)
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, 20(%edx)
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl (%esp), %eax # 4-byte Reload
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, 12(%edx)
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, 4(%edx)
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, 56(%edx)
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, 48(%edx)
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, 40(%edx)
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, 32(%edx)
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, 24(%edx)
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, 16(%edx)
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, 8(%edx)
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    addl $216, %esp
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %ebx, %eax, %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl %edx, %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrl %edi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %ebx, %edi, %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl %eax, %ebp
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %edx, (%eax)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, 56(%eax)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ebp, 60(%eax)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %esi, 48(%eax)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, 52(%eax)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, 40(%eax)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, 44(%eax)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, 32(%eax)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, 36(%eax)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl (%esp), %ecx # 4-byte Reload
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, 24(%eax)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, 28(%eax)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, 16(%eax)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, 20(%eax)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, 8(%eax)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, 12(%eax)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, 4(%eax)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    addl $204, %esp
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    popl %esi
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    popl %edi
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    popl %ebx
@@ -5827,43 +5194,43 @@ define void @shl_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    pushl %edi
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    pushl %esi
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    subl $204, %esp
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %edi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl (%edi), %eax
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ebx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl (%ebx), %eax
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 4(%edi), %eax
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 4(%ebx), %eax
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 8(%edi), %eax
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 8(%ebx), %eax
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 12(%edi), %eax
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 12(%ebx), %eax
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 16(%edi), %eax
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 16(%ebx), %eax
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 20(%edi), %eax
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 20(%ebx), %eax
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 24(%edi), %eax
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 24(%ebx), %eax
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 28(%edi), %eax
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 28(%ebx), %eax
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 32(%edi), %eax
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 32(%ebx), %eax
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 36(%edi), %eax
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, (%esp) # 4-byte Spill
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 40(%edi), %ebp
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 44(%edi), %ebx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 48(%edi), %esi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 52(%edi), %edx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 56(%edi), %ecx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 60(%edi), %eax
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %edi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl (%edi), %edi
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 36(%ebx), %eax
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 40(%ebx), %ebp
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 44(%ebx), %edi
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 48(%ebx), %esi
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 52(%ebx), %edx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 56(%ebx), %ecx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 60(%ebx), %eax
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ebx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl (%ebx), %ebx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    xorps %xmm0, %xmm0
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[0-9]+}}(%esp)
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edx, {{[0-9]+}}(%esp)
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %esi, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ebx, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edi, {{[0-9]+}}(%esp)
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ebp, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl (%esp), %eax # 4-byte Reload
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[0-9]+}}(%esp)
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[0-9]+}}(%esp)
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
@@ -5872,6 +5239,9 @@ define void @shl_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[0-9]+}}(%esp)
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[0-9]+}}(%esp)
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
@@ -5882,148 +5252,95 @@ define void @shl_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[0-9]+}}(%esp)
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edi, %eax
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    andl $7, %eax
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    shrl $3, %edi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    andl $63, %edi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    leal {{[0-9]+}}(%esp), %edx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    subl %edi, %edx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 8(%edx), %ebp
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ebp, (%esp) # 4-byte Spill
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, %ecx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    notl %ecx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    andl $31, %ecx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 4(%edx), %esi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    shrl %esi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    shrxl %ecx, %esi, %ebx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    shlxl %eax, %ebp, %esi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    orl %esi, %ebx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 12(%edx), %esi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    shrl %esi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    shrxl %ecx, %esi, %ebx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 16(%edx), %esi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    shlxl %eax, %esi, %esi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    orl %esi, %ebx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 20(%edx), %esi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    shrl %esi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    shrxl %ecx, %esi, %ebx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 24(%edx), %esi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    shlxl %eax, %esi, %esi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    orl %esi, %ebx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 28(%edx), %esi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    shrl %esi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    shrxl %ecx, %esi, %ebx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 32(%edx), %esi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    shlxl %eax, %esi, %esi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    orl %esi, %ebx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 36(%edx), %esi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    shrl %esi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    shrxl %ecx, %esi, %ebx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 40(%edx), %esi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    shlxl %eax, %esi, %esi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    orl %esi, %ebx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 52(%edx), %ebp
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ebp, %esi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    shrl %esi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    shrxl %ecx, %esi, %ebx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 56(%edx), %esi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    shlxl %eax, %esi, %esi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    orl %esi, %ebx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 44(%edx), %ebx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ebx, %esi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    shrl %esi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    shrxl %ecx, %esi, %ecx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    negl %esi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 188(%esp,%esi), %esi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    shlxl %eax, %esi, %esi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    orl %ecx, %esi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, %ecx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl (%esp), %esi # 4-byte Reload
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    shldl %cl, %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    shldl %cl, %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    shldl %cl, %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    shldl %cl, %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    shldl %cl, %esi, %ebx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl (%edx), %edi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 60(%edx), %esi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    shldl %cl, %edx, %esi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %esi, 60(%edx)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    shlxl %eax, %edi, %ecx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ecx, (%esp) # 4-byte Spill
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, %ecx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    shldl %cl, %edi, %esi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    shldl %cl, %eax, %ebp
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ebp, 52(%edx)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ebx, 44(%edx)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, 36(%edx)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, 28(%edx)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, 20(%edx)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, 12(%edx)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl (%esp), %eax # 4-byte Reload
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, (%edx)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %esi, 4(%edx)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, 56(%edx)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, 48(%edx)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, 40(%edx)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, 32(%edx)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, 24(%edx)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, 16(%edx)
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, 8(%edx)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ebx, %ecx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    andl $31, %ecx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    shrl $3, %ebx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    andl $60, %ebx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    leal {{[0-9]+}}(%esp), %eax
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    subl %ebx, %eax
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 4(%eax), %esi
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 8(%eax), %edi
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 12(%eax), %edx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edx, %ebp
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    shldl %cl, %edi, %ebp
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    shldl %cl, %esi, %edi
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 16(%eax), %edi
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 20(%eax), %esi
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %esi, %ebp
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    shldl %cl, %edi, %ebp
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    shldl %cl, %edx, %edi
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 24(%eax), %edi
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 28(%eax), %edx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edx, %ebp
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    shldl %cl, %edi, %ebp
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    shldl %cl, %esi, %edi
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 32(%eax), %edi
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 36(%eax), %esi
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %esi, %ebp
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    shldl %cl, %edi, %ebp
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    shldl %cl, %edx, %edi
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 40(%eax), %ebp
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 44(%eax), %edx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    shldl %cl, %ebp, %edx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    shldl %cl, %esi, %ebp
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 56(%eax), %edx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 60(%eax), %edi
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    shldl %cl, %edx, %edi
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl (%eax), %esi
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 52(%eax), %esi
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    shldl %cl, %esi, %edx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    negl %ebx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 176(%esp,%ebx), %ebx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edx, 56(%eax)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edi, 60(%eax)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    shlxl %ecx, %edx, %edi
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    shldl %cl, %edx, %edi
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    shldl %cl, %ebx, %esi
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    # kill: def $cl killed $cl killed $ecx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    shldl %cl, %edx, %ebx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ebx, 48(%eax)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %esi, 52(%eax)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ebp, 40(%eax)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ecx, 44(%eax)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ecx, 32(%eax)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ecx, 36(%eax)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ecx, 24(%eax)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ecx, 28(%eax)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ecx, 16(%eax)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ecx, 20(%eax)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ecx, 8(%eax)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ecx, 12(%eax)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ecx, (%eax)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edi, 4(%eax)
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    addl $204, %esp
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    popl %esi
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    popl %edi
@@ -6045,6 +5362,7 @@ define void @ashr_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; X64-NO-BMI2-NO-SHLD-NEXT:    pushq %r13
 ; X64-NO-BMI2-NO-SHLD-NEXT:    pushq %r12
 ; X64-NO-BMI2-NO-SHLD-NEXT:    pushq %rbx
+; X64-NO-BMI2-NO-SHLD-NEXT:    pushq %rax
 ; X64-NO-BMI2-NO-SHLD-NEXT:    movq (%rdi), %rax
 ; X64-NO-BMI2-NO-SHLD-NEXT:    movq 8(%rdi), %rcx
 ; X64-NO-BMI2-NO-SHLD-NEXT:    movq 16(%rdi), %r9
@@ -6072,9 +5390,9 @@ define void @ashr_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; X64-NO-BMI2-NO-SHLD-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
 ; X64-NO-BMI2-NO-SHLD-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
 ; X64-NO-BMI2-NO-SHLD-NEXT:    movl %r8d, %eax
-; X64-NO-BMI2-NO-SHLD-NEXT:    andl $7, %eax
+; X64-NO-BMI2-NO-SHLD-NEXT:    andl $63, %eax
 ; X64-NO-BMI2-NO-SHLD-NEXT:    shrl $3, %r8d
-; X64-NO-BMI2-NO-SHLD-NEXT:    andl $63, %r8d
+; X64-NO-BMI2-NO-SHLD-NEXT:    andl $56, %r8d
 ; X64-NO-BMI2-NO-SHLD-NEXT:    movq -128(%rsp,%r8), %r11
 ; X64-NO-BMI2-NO-SHLD-NEXT:    movq -120(%rsp,%r8), %r9
 ; X64-NO-BMI2-NO-SHLD-NEXT:    movq %r9, %rsi
@@ -6082,7 +5400,6 @@ define void @ashr_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; X64-NO-BMI2-NO-SHLD-NEXT:    shrq %cl, %rsi
 ; X64-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %edi
 ; X64-NO-BMI2-NO-SHLD-NEXT:    notl %edi
-; X64-NO-BMI2-NO-SHLD-NEXT:    andl $63, %edi
 ; X64-NO-BMI2-NO-SHLD-NEXT:    movq -112(%rsp,%r8), %r14
 ; X64-NO-BMI2-NO-SHLD-NEXT:    leaq (%r14,%r14), %r10
 ; X64-NO-BMI2-NO-SHLD-NEXT:    movl %edi, %ecx
@@ -6091,7 +5408,7 @@ define void @ashr_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; X64-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %ecx
 ; X64-NO-BMI2-NO-SHLD-NEXT:    shrq %cl, %r11
 ; X64-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %esi
-; X64-NO-BMI2-NO-SHLD-NEXT:    notb %sil
+; X64-NO-BMI2-NO-SHLD-NEXT:    xorb $63, %sil
 ; X64-NO-BMI2-NO-SHLD-NEXT:    addq %r9, %r9
 ; X64-NO-BMI2-NO-SHLD-NEXT:    movl %esi, %ecx
 ; X64-NO-BMI2-NO-SHLD-NEXT:    shlq %cl, %r9
@@ -6143,6 +5460,7 @@ define void @ashr_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; X64-NO-BMI2-NO-SHLD-NEXT:    movq %rbx, 24(%rdx)
 ; X64-NO-BMI2-NO-SHLD-NEXT:    movq %r9, (%rdx)
 ; X64-NO-BMI2-NO-SHLD-NEXT:    movq %r10, 8(%rdx)
+; X64-NO-BMI2-NO-SHLD-NEXT:    addq $8, %rsp
 ; X64-NO-BMI2-NO-SHLD-NEXT:    popq %rbx
 ; X64-NO-BMI2-NO-SHLD-NEXT:    popq %r12
 ; X64-NO-BMI2-NO-SHLD-NEXT:    popq %r13
@@ -6153,22 +5471,19 @@ define void @ashr_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ;
 ; X64-NO-BMI2-HAVE-SHLD-LABEL: ashr_64bytes:
 ; X64-NO-BMI2-HAVE-SHLD:       # %bb.0:
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    pushq %rbp
 ; X64-NO-BMI2-HAVE-SHLD-NEXT:    pushq %r15
 ; X64-NO-BMI2-HAVE-SHLD-NEXT:    pushq %r14
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    pushq %r13
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    pushq %r12
 ; X64-NO-BMI2-HAVE-SHLD-NEXT:    pushq %rbx
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    pushq %rax
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq (%rdi), %rax
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq 8(%rdi), %rcx
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq 16(%rdi), %r8
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq 24(%rdi), %r9
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq 32(%rdi), %r10
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq 40(%rdi), %r11
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq 48(%rdi), %rbx
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq 56(%rdi), %r14
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    movl (%rsi), %edi
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq (%rdi), %rcx
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq 8(%rdi), %r8
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq 16(%rdi), %r9
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq 24(%rdi), %r10
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq 32(%rdi), %r11
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq 40(%rdi), %rbx
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq 48(%rdi), %r14
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq 56(%rdi), %rdi
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movl (%rsi), %eax
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
 ; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %r14, -{{[0-9]+}}(%rsp)
 ; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %rbx, -{{[0-9]+}}(%rsp)
 ; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %r11, -{{[0-9]+}}(%rsp)
@@ -6176,74 +5491,50 @@ define void @ashr_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %r9, -{{[0-9]+}}(%rsp)
 ; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %r8, -{{[0-9]+}}(%rsp)
 ; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %rcx, -{{[0-9]+}}(%rsp)
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %rax, -{{[0-9]+}}(%rsp)
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    sarq $63, %r14
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %r14, (%rsp)
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %r14, -{{[0-9]+}}(%rsp)
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %r14, -{{[0-9]+}}(%rsp)
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %r14, -{{[0-9]+}}(%rsp)
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %r14, -{{[0-9]+}}(%rsp)
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %r14, -{{[0-9]+}}(%rsp)
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %r14, -{{[0-9]+}}(%rsp)
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %r14, -{{[0-9]+}}(%rsp)
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    movl %edi, %eax
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    andl $7, %eax
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    shrl $3, %edi
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    andl $63, %edi
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq -120(%rsp,%rdi), %r8
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq -112(%rsp,%rdi), %rbx
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %rbx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, %ecx
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    shrq %cl, %rbx
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, %esi
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    notl %esi
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    andl $63, %esi
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq -104(%rsp,%rdi), %r11
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    leaq (%r11,%r11), %r10
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    movl %esi, %ecx
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    shlq %cl, %r10
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    orq %rbx, %r10
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq -96(%rsp,%rdi), %r15
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %r15, %r12
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, %ecx
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    shrq %cl, %r12
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq -88(%rsp,%rdi), %r14
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    leaq (%r14,%r14), %rbx
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    movl %esi, %ecx
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    shlq %cl, %rbx
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    orq %r12, %rbx
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq -80(%rsp,%rdi), %r12
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %r12, %r13
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, %ecx
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    shrq %cl, %r13
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq -72(%rsp,%rdi), %rbp
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    leaq (%rbp,%rbp), %r9
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    movl %esi, %ecx
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    shlq %cl, %r9
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    orq %r13, %r9
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    sarq $63, %rdi
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
 ; X64-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, %ecx
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    shrdq %cl, %r15, %r11
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    shrdq %cl, %r12, %r14
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq -64(%rsp,%rdi), %rsi
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    shrdq %cl, %rsi, %rbp
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rdi # 8-byte Reload
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    andl $63, %ecx
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    shrl $3, %eax
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    andl $56, %eax
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq -112(%rsp,%rax), %rdi
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq -128(%rsp,%rax), %rsi
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq -120(%rsp,%rax), %r9
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %r9, %r8
 ; X64-NO-BMI2-HAVE-SHLD-NEXT:    shrdq %cl, %rdi, %r8
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    sarq %cl, %rsi
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %rbp, 48(%rdx)
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %rsi, 56(%rdx)
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %r14, 32(%rdx)
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %r11, 16(%rdx)
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %r8, (%rdx)
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %r9, 40(%rdx)
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq -96(%rsp,%rax), %r10
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq -104(%rsp,%rax), %r11
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %r11, %rbx
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    shrdq %cl, %r10, %rbx
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    shrdq %cl, %r11, %rdi
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq -80(%rsp,%rax), %r11
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq -88(%rsp,%rax), %r14
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %r14, %r15
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    shrdq %cl, %r11, %r15
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    shrdq %cl, %r14, %r10
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq -72(%rsp,%rax), %rax
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    shrdq %cl, %rax, %r11
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    shrdq %cl, %r9, %rsi
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    # kill: def $cl killed $cl killed $ecx
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    sarq %cl, %rax
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %r11, 48(%rdx)
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %rax, 56(%rdx)
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %r10, 32(%rdx)
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %r15, 40(%rdx)
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %rdi, 16(%rdx)
 ; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %rbx, 24(%rdx)
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %r10, 8(%rdx)
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    addq $8, %rsp
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %rsi, (%rdx)
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %r8, 8(%rdx)
 ; X64-NO-BMI2-HAVE-SHLD-NEXT:    popq %rbx
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    popq %r12
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    popq %r13
 ; X64-NO-BMI2-HAVE-SHLD-NEXT:    popq %r14
 ; X64-NO-BMI2-HAVE-SHLD-NEXT:    popq %r15
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    popq %rbp
 ; X64-NO-BMI2-HAVE-SHLD-NEXT:    retq
 ;
 ; X64-HAVE-BMI2-NO-SHLD-LABEL: ashr_64bytes:
@@ -6254,6 +5545,7 @@ define void @ashr_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    pushq %r13
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    pushq %r12
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    pushq %rbx
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    pushq %rax
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq (%rdi), %rcx
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq 8(%rdi), %r8
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq 16(%rdi), %r9
@@ -6281,44 +5573,43 @@ define void @ashr_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, %ecx
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    andl $7, %ecx
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    andl $63, %ecx
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    shrl $3, %eax
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    andl $63, %eax
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    andl $56, %eax
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq -120(%rsp,%rax), %rdi
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq -112(%rsp,%rax), %r9
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    shrxq %rcx, %rdi, %rbx
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    shrxq %rcx, -128(%rsp,%rax), %r14
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq -112(%rsp,%rax), %r8
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    shrxq %rcx, %rdi, %r15
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    shrxq %rcx, -128(%rsp,%rax), %rbx
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, %esi
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq -104(%rsp,%rax), %r8
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    shrxq %rcx, %r8, %r13
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    shrxq %rcx, %r9, %r10
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq -104(%rsp,%rax), %r9
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    shrxq %rcx, %r9, %r13
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    shrxq %rcx, %r8, %r10
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq -88(%rsp,%rax), %r11
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    shrxq %rcx, %r11, %r15
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    shrxq %rcx, %r11, %r14
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, %r12d
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    notl %r12d
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    andl $63, %r12d
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    addq %r9, %r9
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    shlxq %r12, %r9, %r9
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    orq %rbx, %r9
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq -96(%rsp,%rax), %rbx
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    shrxq %rcx, %rbx, %rbp
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    notb %sil
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    addq %r8, %r8
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    shlxq %r12, %r8, %r8
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    orq %r15, %r8
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq -96(%rsp,%rax), %r15
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    shrxq %rcx, %r15, %rbp
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    xorb $63, %sil
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    addq %rdi, %rdi
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    shlxq %rsi, %rdi, %rdi
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    orq %r14, %rdi
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    addq %rbx, %rbx
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    orq %rbx, %rdi
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    leaq (%r15,%r15), %rbx
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    shlxq %r12, %rbx, %rbx
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    orq %r13, %rbx
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq -80(%rsp,%rax), %r14
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    shrxq %rcx, %r14, %r13
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq -80(%rsp,%rax), %r15
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    shrxq %rcx, %r15, %r13
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq -72(%rsp,%rax), %rax
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    sarxq %rcx, %rax, %rcx
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    addq %r8, %r8
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    shlxq %rsi, %r8, %r8
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    orq %r10, %r8
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    leaq (%r14,%r14), %r10
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    addq %r9, %r9
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    shlxq %rsi, %r9, %r9
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    orq %r10, %r9
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    leaq (%r15,%r15), %r10
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    shlxq %r12, %r10, %r10
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    orq %r15, %r10
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    orq %r14, %r10
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    addq %r11, %r11
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    shlxq %rsi, %r11, %r11
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    orq %rbp, %r11
@@ -6329,10 +5620,11 @@ define void @ashr_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq %rax, 48(%rdx)
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq %r11, 32(%rdx)
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq %r10, 40(%rdx)
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq %r8, 16(%rdx)
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq %r9, 16(%rdx)
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq %rbx, 24(%rdx)
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq %rdi, (%rdx)
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq %r9, 8(%rdx)
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq %r8, 8(%rdx)
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    addq $8, %rsp
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    popq %rbx
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    popq %r12
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    popq %r13
@@ -6343,11 +5635,8 @@ define void @ashr_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ;
 ; X64-HAVE-BMI2-HAVE-SHLD-LABEL: ashr_64bytes:
 ; X64-HAVE-BMI2-HAVE-SHLD:       # %bb.0:
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    pushq %rbp
 ; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    pushq %r15
 ; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    pushq %r14
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    pushq %r13
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    pushq %r12
 ; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    pushq %rbx
 ; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq (%rdi), %rcx
 ; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq 8(%rdi), %r8
@@ -6376,52 +5665,39 @@ define void @ashr_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
 ; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
 ; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, %ecx
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    andl $7, %ecx
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    andl $63, %ecx
 ; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    shrl $3, %eax
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    andl $63, %eax
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq -120(%rsp,%rax), %rsi
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    shrxq %rcx, %rsi, %r10
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq -104(%rsp,%rax), %r8
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    shrxq %rcx, %r8, %r15
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq -88(%rsp,%rax), %r11
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    shrxq %rcx, %r11, %r14
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ecx, %r12d
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    notl %r12d
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    andl $63, %r12d
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq -112(%rsp,%rax), %r9
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    leaq (%r9,%r9), %rdi
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    shlxq %r12, %rdi, %rdi
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    orq %r10, %rdi
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq -96(%rsp,%rax), %rbx
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    leaq (%rbx,%rbx), %r10
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    shlxq %r12, %r10, %r10
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    orq %r15, %r10
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq -80(%rsp,%rax), %r13
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    leaq (%r13,%r13), %r15
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    shlxq %r12, %r15, %r15
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq -72(%rsp,%rax), %r12
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    sarxq %rcx, %r12, %rbp
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq -128(%rsp,%rax), %rax
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    orq %r14, %r15
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    shrdq %cl, %r8, %r9
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    shrdq %cl, %r11, %rbx
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    shrdq %cl, %r12, %r13
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    # kill: def $cl killed $cl killed $rcx
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    shrdq %cl, %rsi, %rax
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq %r13, 48(%rdx)
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq %rbp, 56(%rdx)
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq %rbx, 32(%rdx)
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq %r9, 16(%rdx)
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq %rax, (%rdx)
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    andl $56, %eax
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq -112(%rsp,%rax), %rdi
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq -128(%rsp,%rax), %rsi
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq -120(%rsp,%rax), %r9
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq %r9, %r8
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    shrdq %cl, %rdi, %r8
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq -96(%rsp,%rax), %r10
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq -104(%rsp,%rax), %r11
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq %r11, %rbx
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    shrdq %cl, %r10, %rbx
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    shrdq %cl, %r11, %rdi
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq -80(%rsp,%rax), %r11
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq -88(%rsp,%rax), %r14
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq %r14, %r15
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    shrdq %cl, %r11, %r15
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    shrdq %cl, %r14, %r10
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq -72(%rsp,%rax), %rax
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    shrdq %cl, %rax, %r11
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    shrdq %cl, %r9, %rsi
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    sarxq %rcx, %rax, %rax
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq %r11, 48(%rdx)
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq %rax, 56(%rdx)
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq %r10, 32(%rdx)
 ; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq %r15, 40(%rdx)
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq %r10, 24(%rdx)
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq %rdi, 8(%rdx)
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq %rdi, 16(%rdx)
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq %rbx, 24(%rdx)
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq %rsi, (%rdx)
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq %r8, 8(%rdx)
 ; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    popq %rbx
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    popq %r12
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    popq %r13
 ; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    popq %r14
 ; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    popq %r15
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    popq %rbp
 ; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    retq
 ;
 ; X86-NO-BMI2-NO-SHLD-LABEL: ashr_64bytes:
@@ -6430,12 +5706,12 @@ define void @ashr_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; X86-NO-BMI2-NO-SHLD-NEXT:    pushl %ebx
 ; X86-NO-BMI2-NO-SHLD-NEXT:    pushl %edi
 ; X86-NO-BMI2-NO-SHLD-NEXT:    pushl %esi
-; X86-NO-BMI2-NO-SHLD-NEXT:    subl $208, %esp
+; X86-NO-BMI2-NO-SHLD-NEXT:    subl $204, %esp
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl (%eax), %ecx
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl 4(%eax), %ecx
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ecx, (%esp) # 4-byte Spill
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl 8(%eax), %ecx
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl 12(%eax), %ecx
@@ -6443,7 +5719,7 @@ define void @ashr_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl 16(%eax), %ecx
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl 20(%eax), %ecx
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ecx, (%esp) # 4-byte Spill
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl 24(%eax), %ecx
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl 28(%eax), %ecx
@@ -6452,19 +5728,19 @@ define void @ashr_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl 36(%eax), %ecx
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl 40(%eax), %ebp
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl 44(%eax), %ebx
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl 48(%eax), %edi
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl 40(%eax), %ebx
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl 44(%eax), %edi
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl 48(%eax), %esi
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl 52(%eax), %edx
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl 56(%eax), %ecx
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl 60(%eax), %eax
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl (%esi), %esi
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ebp
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl (%ebp), %ebp
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %esi, {{[0-9]+}}(%esp)
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl %edi, {{[0-9]+}}(%esp)
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ebx, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ebp, {{[0-9]+}}(%esp)
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
@@ -6473,7 +5749,7 @@ define void @ashr_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl (%esp), %ecx # 4-byte Reload
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, {{[0-9]+}}(%esp)
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
@@ -6482,7 +5758,7 @@ define void @ashr_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl (%esp), %ecx # 4-byte Reload
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
@@ -6503,196 +5779,195 @@ define void @ashr_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, {{[0-9]+}}(%esp)
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, {{[0-9]+}}(%esp)
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %esi, %ecx
-; X86-NO-BMI2-NO-SHLD-NEXT:    andl $7, %ecx
-; X86-NO-BMI2-NO-SHLD-NEXT:    shrl $3, %esi
-; X86-NO-BMI2-NO-SHLD-NEXT:    andl $63, %esi
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl 84(%esp,%esi), %edi
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ebp, %ebx
+; X86-NO-BMI2-NO-SHLD-NEXT:    andl $31, %ebx
+; X86-NO-BMI2-NO-SHLD-NEXT:    shrl $3, %ebp
+; X86-NO-BMI2-NO-SHLD-NEXT:    andl $60, %ebp
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl 68(%esp,%ebp), %eax
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %esi
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ebx, %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %esi
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ebx, %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT:    notl %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl 72(%esp,%ebp), %edx
+; X86-NO-BMI2-NO-SHLD-NEXT:    leal (%edx,%edx), %edi
+; X86-NO-BMI2-NO-SHLD-NEXT:    # kill: def $cl killed $cl killed $ecx
+; X86-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %edi
+; X86-NO-BMI2-NO-SHLD-NEXT:    orl %esi, %edi
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %edi
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl 88(%esp,%esi), %eax
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl 64(%esp,%ebp), %esi
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ebx, %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %esi
+; X86-NO-BMI2-NO-SHLD-NEXT:    xorb $31, %cl
+; X86-NO-BMI2-NO-SHLD-NEXT:    movb %cl, (%esp) # 1-byte Spill
+; X86-NO-BMI2-NO-SHLD-NEXT:    addl %eax, %eax
+; X86-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %eax
+; X86-NO-BMI2-NO-SHLD-NEXT:    orl %esi, %eax
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-NO-SHLD-NEXT:    leal (%eax,%eax), %ebx
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ecx, %edx
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ecx, %eax
-; X86-NO-BMI2-NO-SHLD-NEXT:    notl %edx
-; X86-NO-BMI2-NO-SHLD-NEXT:    andl $31, %edx
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %edx, %ecx
-; X86-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %ebx
-; X86-NO-BMI2-NO-SHLD-NEXT:    orl %edi, %ebx
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl 92(%esp,%esi), %edi
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl 76(%esp,%ebp), %edi
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %ecx
-; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %edi
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl 96(%esp,%esi), %ecx
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-NO-SHLD-NEXT:    leal (%ecx,%ecx), %ebx
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %edx, %ecx
-; X86-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %ebx
-; X86-NO-BMI2-NO-SHLD-NEXT:    orl %edi, %ebx
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ebx, %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ebx, %eax
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl 100(%esp,%esi), %edi
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %ecx
 ; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %edi
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl 104(%esp,%esi), %ecx
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-NO-SHLD-NEXT:    leal (%ecx,%ecx), %ebx
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %edx, %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl 80(%esp,%ebp), %esi
+; X86-NO-BMI2-NO-SHLD-NEXT:    leal (%esi,%esi), %ebx
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NO-BMI2-NO-SHLD-NEXT:    # kill: def $cl killed $cl killed $ecx
 ; X86-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %ebx
 ; X86-NO-BMI2-NO-SHLD-NEXT:    orl %edi, %ebx
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl 108(%esp,%esi), %edi
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %edx
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NO-BMI2-NO-SHLD-NEXT:    addl %eax, %eax
+; X86-NO-BMI2-NO-SHLD-NEXT:    movzbl (%esp), %ecx # 1-byte Folded Reload
+; X86-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %eax
+; X86-NO-BMI2-NO-SHLD-NEXT:    orl %edx, %eax
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl 84(%esp,%ebp), %ebx
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ebx, %edi
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %edx, %ecx
 ; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %edi
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl 112(%esp,%esi), %ecx
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-NO-SHLD-NEXT:    leal (%ecx,%ecx), %ebx
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl 88(%esp,%ebp), %eax
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-BMI2-NO-SHLD-NEXT:    addl %eax, %eax
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NO-BMI2-NO-SHLD-NEXT:    # kill: def $cl killed $cl killed $ecx
+; X86-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %eax
+; X86-NO-BMI2-NO-SHLD-NEXT:    orl %edi, %eax
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl %edx, %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %edx, %eax
+; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %esi
+; X86-NO-BMI2-NO-SHLD-NEXT:    addl %ebx, %ebx
+; X86-NO-BMI2-NO-SHLD-NEXT:    movzbl (%esp), %ecx # 1-byte Folded Reload
 ; X86-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %ebx
-; X86-NO-BMI2-NO-SHLD-NEXT:    orl %edi, %ebx
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl 116(%esp,%esi), %ebx
+; X86-NO-BMI2-NO-SHLD-NEXT:    orl %esi, %ebx
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl 92(%esp,%ebp), %edx
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %edx, %esi
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %ecx
-; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %ebx
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl 120(%esp,%esi), %ecx
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-NO-SHLD-NEXT:    leal (%ecx,%ecx), %edi
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %edx, %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %ebx
+; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %esi
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl 96(%esp,%ebp), %eax
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-BMI2-NO-SHLD-NEXT:    leal (%eax,%eax), %edi
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NO-BMI2-NO-SHLD-NEXT:    # kill: def $cl killed $cl killed $ecx
 ; X86-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %edi
-; X86-NO-BMI2-NO-SHLD-NEXT:    orl %ebx, %edi
+; X86-NO-BMI2-NO-SHLD-NEXT:    orl %esi, %edi
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl 124(%esp,%esi), %ebx
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ebx, (%esp) # 4-byte Spill
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ebx, %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %eax
+; X86-NO-BMI2-NO-SHLD-NEXT:    addl %edx, %edx
+; X86-NO-BMI2-NO-SHLD-NEXT:    movzbl (%esp), %ecx # 1-byte Folded Reload
+; X86-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %edx
+; X86-NO-BMI2-NO-SHLD-NEXT:    orl %eax, %edx
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl 100(%esp,%ebp), %edi
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %edi, %esi
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ebx, %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %esi
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl 104(%esp,%ebp), %edx
+; X86-NO-BMI2-NO-SHLD-NEXT:    leal (%edx,%edx), %eax
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NO-BMI2-NO-SHLD-NEXT:    # kill: def $cl killed $cl killed $ecx
+; X86-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %eax
+; X86-NO-BMI2-NO-SHLD-NEXT:    orl %esi, %eax
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ebx, %eax
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %ecx
-; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %ebx
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl 128(%esp,%esi), %ecx
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-NO-SHLD-NEXT:    leal (%ecx,%ecx), %edi
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %edx, %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %esi
+; X86-NO-BMI2-NO-SHLD-NEXT:    addl %edi, %edi
+; X86-NO-BMI2-NO-SHLD-NEXT:    movzbl (%esp), %ecx # 1-byte Folded Reload
 ; X86-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %edi
-; X86-NO-BMI2-NO-SHLD-NEXT:    orl %ebx, %edi
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl 132(%esp,%esi), %edi
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %edi, %ebx
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %ecx
-; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %ebx
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl 136(%esp,%esi), %ecx
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-NO-SHLD-NEXT:    leal (%ecx,%ecx), %ebp
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %edx, %ecx
-; X86-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %ebp
-; X86-NO-BMI2-NO-SHLD-NEXT:    orl %ebx, %ebp
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl 80(%esp,%esi), %ebx
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %ecx
-; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %ebx
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %edx
-; X86-NO-BMI2-NO-SHLD-NEXT:    notb %dl
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
-; X86-NO-BMI2-NO-SHLD-NEXT:    addl %ebp, %ebp
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %edx, %ecx
-; X86-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %ebp
-; X86-NO-BMI2-NO-SHLD-NEXT:    orl %ebx, %ebp
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %ecx
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %ebx
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
-; X86-NO-BMI2-NO-SHLD-NEXT:    addl %ebp, %ebp
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %edx, %ecx
-; X86-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %ebp
-; X86-NO-BMI2-NO-SHLD-NEXT:    orl %ebx, %ebp
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %ecx
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %ebx
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
-; X86-NO-BMI2-NO-SHLD-NEXT:    addl %ebp, %ebp
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %edx, %ecx
-; X86-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %ebp
-; X86-NO-BMI2-NO-SHLD-NEXT:    orl %ebx, %ebp
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %ecx
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %ebx
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
-; X86-NO-BMI2-NO-SHLD-NEXT:    addl %ebp, %ebp
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %edx, %ecx
-; X86-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %ebp
-; X86-NO-BMI2-NO-SHLD-NEXT:    orl %ebx, %ebp
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %ecx
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %ebx
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
-; X86-NO-BMI2-NO-SHLD-NEXT:    addl %ebp, %ebp
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %edx, %ecx
-; X86-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %ebp
-; X86-NO-BMI2-NO-SHLD-NEXT:    orl %ebx, %ebp
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %ecx
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %ebx
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl (%esp), %ebp # 4-byte Reload
-; X86-NO-BMI2-NO-SHLD-NEXT:    addl %ebp, %ebp
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %edx, %ecx
-; X86-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %ebp
-; X86-NO-BMI2-NO-SHLD-NEXT:    orl %ebx, %ebp
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ebp, (%esp) # 4-byte Spill
+; X86-NO-BMI2-NO-SHLD-NEXT:    orl %esi, %edi
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl 108(%esp,%ebp), %ebx
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ebx, %esi
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %ecx
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %ebx
-; X86-NO-BMI2-NO-SHLD-NEXT:    addl %edi, %edi
+; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %esi
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl 112(%esp,%ebp), %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-BMI2-NO-SHLD-NEXT:    leal (%ecx,%ecx), %edi
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NO-BMI2-NO-SHLD-NEXT:    # kill: def $cl killed $cl killed $ecx
+; X86-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %edi
+; X86-NO-BMI2-NO-SHLD-NEXT:    orl %esi, %edi
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %edi
+; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %edx
+; X86-NO-BMI2-NO-SHLD-NEXT:    addl %ebx, %ebx
+; X86-NO-BMI2-NO-SHLD-NEXT:    movzbl (%esp), %ecx # 1-byte Folded Reload
+; X86-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %ebx
+; X86-NO-BMI2-NO-SHLD-NEXT:    orl %edx, %ebx
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl 116(%esp,%ebp), %esi
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %esi, %eax
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %edi, %edx
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl %edx, %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %eax
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl 120(%esp,%ebp), %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-BMI2-NO-SHLD-NEXT:    leal (%ecx,%ecx), %edi
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NO-BMI2-NO-SHLD-NEXT:    # kill: def $cl killed $cl killed $ecx
 ; X86-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %edi
-; X86-NO-BMI2-NO-SHLD-NEXT:    orl %ebx, %edi
+; X86-NO-BMI2-NO-SHLD-NEXT:    orl %eax, %edi
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %edx, %eax
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %ecx
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %ebp
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %edx
+; X86-NO-BMI2-NO-SHLD-NEXT:    addl %esi, %esi
+; X86-NO-BMI2-NO-SHLD-NEXT:    movb (%esp), %ch # 1-byte Reload
+; X86-NO-BMI2-NO-SHLD-NEXT:    movb %ch, %cl
+; X86-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %esi
+; X86-NO-BMI2-NO-SHLD-NEXT:    orl %edx, %esi
+; X86-NO-BMI2-NO-SHLD-NEXT:    movb %al, %cl
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
 ; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %eax
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl 140(%esp,%esi), %esi
-; X86-NO-BMI2-NO-SHLD-NEXT:    leal (%esi,%esi), %ebx
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %edx, %ecx
-; X86-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %ebx
-; X86-NO-BMI2-NO-SHLD-NEXT:    orl %eax, %ebx
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ebp, %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl 124(%esp,%ebp), %ebp
+; X86-NO-BMI2-NO-SHLD-NEXT:    leal (%ebp,%ebp), %edx
+; X86-NO-BMI2-NO-SHLD-NEXT:    movb %ch, %cl
+; X86-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %edx
+; X86-NO-BMI2-NO-SHLD-NEXT:    orl %eax, %edx
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
 ; X86-NO-BMI2-NO-SHLD-NEXT:    # kill: def $cl killed $cl killed $ecx
-; X86-NO-BMI2-NO-SHLD-NEXT:    sarl %cl, %esi
+; X86-NO-BMI2-NO-SHLD-NEXT:    sarl %cl, %ebp
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %esi, 60(%eax)
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ebx, 56(%eax)
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %edi, 48(%eax)
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl (%esp), %ecx # 4-byte Reload
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ecx, 40(%eax)
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ecx, 32(%eax)
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ecx, 24(%eax)
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ecx, 16(%eax)
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ecx, 8(%eax)
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ecx, (%eax)
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ecx, 52(%eax)
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ebp, 60(%eax)
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %edx, 56(%eax)
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %esi, 48(%eax)
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %edi, 52(%eax)
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ebx, 40(%eax)
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ecx, 44(%eax)
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ecx, 32(%eax)
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ecx, 36(%eax)
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ecx, 24(%eax)
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ecx, 28(%eax)
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ecx, 16(%eax)
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ecx, 20(%eax)
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ecx, 8(%eax)
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ecx, 12(%eax)
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ecx, (%eax)
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ecx, 4(%eax)
-; X86-NO-BMI2-NO-SHLD-NEXT:    addl $208, %esp
+; X86-NO-BMI2-NO-SHLD-NEXT:    addl $204, %esp
 ; X86-NO-BMI2-NO-SHLD-NEXT:    popl %esi
 ; X86-NO-BMI2-NO-SHLD-NEXT:    popl %edi
 ; X86-NO-BMI2-NO-SHLD-NEXT:    popl %ebx
@@ -6705,7 +5980,7 @@ define void @ashr_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; X86-NO-BMI2-HAVE-SHLD-NEXT:    pushl %ebx
 ; X86-NO-BMI2-HAVE-SHLD-NEXT:    pushl %edi
 ; X86-NO-BMI2-HAVE-SHLD-NEXT:    pushl %esi
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    subl $204, %esp
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    subl $188, %esp
 ; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl (%eax), %ecx
 ; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
@@ -6718,7 +5993,7 @@ define void @ashr_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 16(%eax), %ecx
 ; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 20(%eax), %ecx
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %ecx, (%esp) # 4-byte Spill
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 24(%eax), %ecx
 ; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 28(%eax), %ecx
@@ -6726,189 +6001,144 @@ define void @ashr_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 32(%eax), %ecx
 ; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 36(%eax), %ecx
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %ecx, (%esp) # 4-byte Spill
 ; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 40(%eax), %ebp
 ; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 44(%eax), %ebx
 ; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 48(%eax), %edi
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 52(%eax), %edx
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 56(%eax), %ecx
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 60(%eax), %eax
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl (%esi), %esi
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 52(%eax), %esi
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 56(%eax), %edx
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 60(%eax), %ecx
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl (%eax), %eax
 ; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %esi, {{[0-9]+}}(%esp)
 ; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %edi, {{[0-9]+}}(%esp)
 ; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %ebx, {{[0-9]+}}(%esp)
 ; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %ebp, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl (%esp), %edx # 4-byte Reload
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    sarl $31, %ecx
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
 ; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
 ; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
 ; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
 ; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl (%esp), %ecx # 4-byte Reload
 ; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
 ; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
 ; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
 ; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
 ; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
 ; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    sarl $31, %eax
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %esi, %eax
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    andl $7, %eax
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    shrl $3, %esi
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    andl $63, %esi
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 80(%esp,%esi), %edi
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, %ecx
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    shrl %cl, %edi
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 84(%esp,%esi), %ecx
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    leal (%ecx,%ecx), %ebx
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, %edx
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    notl %edx
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    andl $31, %edx
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %edx, %ecx
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    shll %cl, %ebx
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    orl %edi, %ebx
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 88(%esp,%esi), %edi
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, %ecx
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    shrl %cl, %edi
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 92(%esp,%esi), %ecx
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    leal (%ecx,%ecx), %ebx
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %edx, %ecx
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    shll %cl, %ebx
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    orl %edi, %ebx
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 96(%esp,%esi), %edi
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, %ecx
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    shrl %cl, %edi
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 100(%esp,%esi), %ecx
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    leal (%ecx,%ecx), %ebx
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %edx, %ecx
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    shll %cl, %ebx
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    orl %edi, %ebx
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 104(%esp,%esi), %edi
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, %ecx
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    shrl %cl, %edi
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 108(%esp,%esi), %ecx
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    leal (%ecx,%ecx), %ebx
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %edx, %ecx
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    shll %cl, %ebx
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    orl %edi, %ebx
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 112(%esp,%esi), %edi
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    andl $31, %ecx
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    shrl $3, %eax
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    andl $60, %eax
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 56(%esp,%eax), %esi
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 52(%esp,%eax), %edx
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %esi, %edx
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 64(%esp,%eax), %ebx
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 60(%esp,%eax), %edx
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %edx, %edi
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %ebx, %edi
 ; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, %ecx
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    shrl %cl, %edi
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 116(%esp,%esi), %ecx
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    leal (%ecx,%ecx), %ebx
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %edx, %ecx
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    shll %cl, %ebx
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    orl %edi, %ebx
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %edx, %esi
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 72(%esp,%eax), %edi
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 68(%esp,%eax), %edx
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %edx, %esi
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %edi, %esi
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %edx, %ebx
 ; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 120(%esp,%esi), %edi
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 80(%esp,%eax), %ebx
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 76(%esp,%eax), %edx
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %edx, %esi
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %ebx, %esi
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %edx, %edi
 ; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, %ecx
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    shrl %cl, %edi
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 124(%esp,%esi), %ecx
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %ecx, (%esp) # 4-byte Spill
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    leal (%ecx,%ecx), %ebx
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %edx, %ecx
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    shll %cl, %ebx
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    orl %edi, %ebx
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 88(%esp,%eax), %edi
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 84(%esp,%eax), %edx
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %edx, %esi
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %edi, %esi
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %edi, %esi
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %edx, %ebx
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %ebx, (%esp) # 4-byte Spill
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 96(%esp,%eax), %edi
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 92(%esp,%eax), %edx
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %edx, %ebx
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %edi, %ebx
 ; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 128(%esp,%esi), %edi
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, %ecx
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    shrl %cl, %edi
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 132(%esp,%esi), %ebx
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    leal (%ebx,%ebx), %ebp
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %edx, %ecx
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    shll %cl, %ebp
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    orl %edi, %ebp
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, %ecx
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %edx, (%esp) # 4-byte Folded Spill
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 76(%esp,%esi), %edi
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 136(%esp,%esi), %esi
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %edx, %esi
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 104(%esp,%eax), %esi
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 100(%esp,%eax), %edx
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %edx, %ebx
 ; X86-NO-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %esi, %ebx
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %ebx, 56(%edx)
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %ebx, %edi
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    sarl %cl, %esi
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %esi, 60(%edx)
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl (%esp), %eax # 4-byte Reload
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, 48(%edx)
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, 40(%edx)
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, 32(%edx)
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, 24(%edx)
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, 16(%edx)
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, 8(%edx)
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %edi, (%edx)
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %ebp, 52(%edx)
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, 44(%edx)
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, 36(%edx)
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, 28(%edx)
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, 20(%edx)
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, 12(%edx)
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, 4(%edx)
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    addl $204, %esp
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %edx, %edi
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 48(%esp,%eax), %ebp
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 108(%esp,%eax), %edx
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %edx, %esi
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %esi, 56(%eax)
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %esi, %ebp
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    # kill: def $cl killed $cl killed $ecx
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    sarl %cl, %edx
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %edx, 60(%eax)
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %edi, 48(%eax)
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %ebx, 52(%eax)
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %ecx, 40(%eax)
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %ecx, 44(%eax)
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl (%esp), %ecx # 4-byte Reload
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %ecx, 32(%eax)
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %ecx, 36(%eax)
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %ecx, 24(%eax)
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %ecx, 28(%eax)
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %ecx, 16(%eax)
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %ecx, 20(%eax)
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %ecx, 8(%eax)
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %ecx, 12(%eax)
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %ebp, (%eax)
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %ecx, 4(%eax)
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    addl $188, %esp
 ; X86-NO-BMI2-HAVE-SHLD-NEXT:    popl %esi
 ; X86-NO-BMI2-HAVE-SHLD-NEXT:    popl %edi
 ; X86-NO-BMI2-HAVE-SHLD-NEXT:    popl %ebx
@@ -6942,199 +6172,199 @@ define void @ashr_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 32(%eax), %ecx
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 36(%eax), %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, (%esp) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 40(%eax), %ebp
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 44(%eax), %ebx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 40(%eax), %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 44(%eax), %ebp
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 48(%eax), %edi
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 52(%eax), %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 56(%eax), %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 60(%eax), %eax
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl (%edx), %edx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 56(%eax), %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 60(%eax), %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl (%eax), %ebx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %edx, {{[0-9]+}}(%esp)
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %esi, {{[0-9]+}}(%esp)
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %edi, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ebx, {{[0-9]+}}(%esp)
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ebp, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl (%esp), %ecx # 4-byte Reload
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    sarl $31, %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    sarl $31, %eax
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %edx, %eax
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    andl $7, %eax
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrl $3, %edx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    andl $63, %edx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 80(%esp,%edx), %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 84(%esp,%edx), %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %eax, %ecx, %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, %edi
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    notl %edi
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    andl $31, %edi
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    addl %esi, %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %edi, %esi, %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl %ecx, %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 92(%esp,%edx), %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, (%esp) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    addl %ecx, %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %edi, %ecx, %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 88(%esp,%edx), %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %eax, %ecx, %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl %ecx, %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 100(%esp,%edx), %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    addl %ecx, %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %edi, %ecx, %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 96(%esp,%edx), %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %eax, %ecx, %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl %ecx, %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 108(%esp,%edx), %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    addl %ecx, %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %edi, %ecx, %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 104(%esp,%edx), %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %eax, %ecx, %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl %ecx, %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 116(%esp,%edx), %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    addl %ecx, %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %edi, %ecx, %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 112(%esp,%edx), %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %eax, %ecx, %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl %ecx, %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ebx, %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    andl $31, %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrl $3, %ebx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    andl $60, %ebx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 68(%esp,%ebx), %edi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 72(%esp,%ebx), %esi
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 124(%esp,%edx), %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %edx, %edi, %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %edx, %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    notl %ecx
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    addl %ecx, %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %edi, %ecx, %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 120(%esp,%edx), %ebx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %eax, %ebx, %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl %ecx, %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    leal (%esi,%esi), %ebp
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %ecx, %ebp, %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl %eax, %esi
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 132(%esp,%edx), %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    addl %ecx, %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %edi, %ecx, %edi
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 128(%esp,%edx), %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %eax, %esi, %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl %ecx, %edi
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    notb %cl
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %edx, %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    xorb $31, %al
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    addl %edi, %edi
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %ecx, %edi, %ebp
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %eax, 76(%esp,%edx), %edi
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl %edi, %ebp
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %eax, {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    addl %ebp, %ebp
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %ecx, %ebp, %ebp
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl %edi, %ebp
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %eax, (%esp), %edi # 4-byte Folded Reload
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %eax, %edi, %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %edx, 64(%esp,%ebx), %edi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl %edi, %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 80(%esp,%ebx), %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    leal (%eax,%eax), %ebp
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %eax, %ebp, %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 76(%esp,%ebx), %ebp
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %edx, %ebp, %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl %esi, %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    addl %ebp, %ebp
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %ecx, %ebp, %ebp
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl %edi, %ebp
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ebp, (%esp) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %eax, {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %ecx, %ebp, %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, %edi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl %esi, %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 88(%esp,%ebx), %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    leal (%eax,%eax), %ebp
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %ecx, %ebp, %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 84(%esp,%ebx), %ebp
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %edx, %ebp, %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl %esi, %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    addl %ebp, %ebp
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %ecx, %ebp, %ebp
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl %edi, %ebp
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %eax, {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %edi, %ebp, %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl %esi, %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 96(%esp,%ebx), %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    leal (%eax,%eax), %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %ecx, %esi, %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 92(%esp,%ebx), %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %edx, %esi, %ebp
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl %ebp, %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    addl %esi, %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %edi, %esi, %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl %ebp, %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 104(%esp,%ebx), %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    leal (%eax,%eax), %ebp
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %ecx, %ebp, %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, %edi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 100(%esp,%ebx), %ebp
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %edx, %ebp, %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl %esi, %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    addl %ebp, %ebp
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %ecx, %ebp, %ebp
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl %edi, %ebp
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %eax, {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    addl %ebx, %ebx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %ecx, %ebx, %ebx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl %edi, %ebx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %eax, {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %ecx, %ebp, %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl %esi, %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 112(%esp,%ebx), %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    leal (%eax,%eax), %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %edi, %esi, %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 108(%esp,%ebx), %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %edx, %esi, %ebp
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl %ebp, %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    addl %esi, %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %ecx, %esi, %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl %edi, %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %eax, {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 136(%esp,%edx), %edx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    sarxl %eax, %edx, %edi
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    leal (%edx,%edx), %eax
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %ecx, %eax, %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %ecx, %esi, %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl %ebp, %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 120(%esp,%ebx), %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    leal (%eax,%eax), %edi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %esi, %edi, %ebp
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 116(%esp,%ebx), %edi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %edx, %edi, %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl %esi, %ebp
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    addl %edi, %edi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %ecx, %edi, %edi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl %esi, %edi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %edx, %eax, %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 124(%esp,%ebx), %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    sarxl %edx, %eax, %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    addl %eax, %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %ecx, %eax, %ebx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl %esi, %ebx
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %edi, 60(%eax)
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, 56(%eax)
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %esi, 48(%eax)
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ebx, 40(%eax)
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ebp, 32(%eax)
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, 24(%eax)
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl (%esp), %ecx # 4-byte Reload
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, 16(%eax)
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, 8(%eax)
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, (%eax)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %edx, 60(%eax)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ebx, 56(%eax)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %edi, 48(%eax)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ebp, 52(%eax)
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, 52(%eax)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, 40(%eax)
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, 44(%eax)
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, 32(%eax)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, 36(%eax)
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, 24(%eax)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, 28(%eax)
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, 16(%eax)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, 20(%eax)
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, 8(%eax)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, 12(%eax)
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, (%eax)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, 4(%eax)
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    addl $204, %esp
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    popl %esi
@@ -7149,7 +6379,7 @@ define void @ashr_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    pushl %ebx
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    pushl %edi
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    pushl %esi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    subl $200, %esp
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    subl $188, %esp
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl (%eax), %ecx
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
@@ -7158,7 +6388,7 @@ define void @ashr_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 8(%eax), %ecx
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 12(%eax), %ecx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ecx, (%esp) # 4-byte Spill
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 16(%eax), %ecx
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 20(%eax), %ecx
@@ -7170,173 +6400,142 @@ define void @ashr_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 32(%eax), %ecx
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 36(%eax), %ecx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ecx, (%esp) # 4-byte Spill
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 40(%eax), %ebp
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 44(%eax), %ebx
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 48(%eax), %edi
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 52(%eax), %esi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 56(%eax), %ecx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 60(%eax), %eax
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl (%edx), %edx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 56(%eax), %edx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 60(%eax), %ecx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl (%eax), %eax
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edx, {{[0-9]+}}(%esp)
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %esi, {{[0-9]+}}(%esp)
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edi, {{[0-9]+}}(%esp)
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ebx, {{[0-9]+}}(%esp)
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ebp, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl (%esp), %edx # 4-byte Reload
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edx, {{[0-9]+}}(%esp)
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    sarl $31, %ecx
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl (%esp), %ecx # 4-byte Reload
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    sarl $31, %eax
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edx, %ecx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    andl $7, %ecx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    shrl $3, %edx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    andl $63, %edx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 76(%esp,%edx), %eax
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    shrxl %ecx, %eax, %esi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ecx, %eax
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    notl %eax
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    andl $31, %eax
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 80(%esp,%edx), %edi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    addl %edi, %edi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    shlxl %eax, %edi, %edi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    orl %esi, %edi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 88(%esp,%edx), %esi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    addl %esi, %esi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    shlxl %eax, %esi, %edi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 84(%esp,%edx), %esi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    shrxl %ecx, %esi, %esi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    orl %esi, %edi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 96(%esp,%edx), %esi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    addl %esi, %esi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    shlxl %eax, %esi, %edi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 92(%esp,%edx), %esi
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, %ecx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    andl $31, %ecx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    shrl $3, %eax
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    andl $60, %eax
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 56(%esp,%eax), %edi
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 52(%esp,%eax), %edx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %edi, %edx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 64(%esp,%eax), %ebx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 60(%esp,%eax), %edx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edx, %esi
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %ebx, %esi
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    shrxl %ecx, %esi, %esi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    orl %esi, %edi
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %edx, %edi
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 104(%esp,%edx), %esi
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 72(%esp,%eax), %edi
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 68(%esp,%eax), %edx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edx, %esi
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %edi, %esi
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    addl %esi, %esi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    shlxl %eax, %esi, %edi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 100(%esp,%edx), %esi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    shrxl %ecx, %esi, %esi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    orl %esi, %edi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 112(%esp,%edx), %esi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %esi, (%esp) # 4-byte Spill
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    addl %esi, %esi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    shlxl %eax, %esi, %edi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 108(%esp,%edx), %esi
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %edx, %ebx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 80(%esp,%eax), %ebx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 76(%esp,%eax), %edx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edx, %esi
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %ebx, %esi
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    shrxl %ecx, %esi, %esi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    orl %esi, %edi
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %edx, %edi
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 120(%esp,%edx), %ebx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    leal (%ebx,%ebx), %esi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    shlxl %eax, %esi, %edi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 116(%esp,%edx), %esi
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 88(%esp,%eax), %ebp
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 84(%esp,%eax), %edx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edx, %esi
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %ebp, %esi
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    shrxl %ecx, %esi, %esi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    orl %esi, %edi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 128(%esp,%edx), %edi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    leal (%edi,%edi), %ebp
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    shlxl %eax, %ebp, %esi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 124(%esp,%edx), %eax
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    shrxl %ecx, %eax, %ebp
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    orl %ebp, %esi
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %edx, %ebx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ebx, (%esp) # 4-byte Spill
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 96(%esp,%eax), %edi
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 92(%esp,%eax), %edx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edx, %esi
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %edi, %esi
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %ebp, %esi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %ebp, (%esp) # 4-byte Folded Spill
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %eax, %ebx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 72(%esp,%edx), %eax
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 132(%esp,%edx), %ebp
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %ebp, %edi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edi, 56(%edx)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ebx, 48(%edx)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl (%esp), %edi # 4-byte Reload
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edi, 40(%edx)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %esi, 32(%edx)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %esi, 24(%edx)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %esi, 16(%edx)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %esi, 8(%edx)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    sarxl %ecx, %ebp, %esi
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %edx, %ebp
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 104(%esp,%eax), %edx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 100(%esp,%eax), %ebx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ebx, %esi
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %edx, %esi
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %ebx, %edi
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 48(%esp,%eax), %ebx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 108(%esp,%eax), %eax
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %eax, %edx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edx, 56(%eax)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edi, 48(%eax)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %esi, 52(%eax)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ebp, 40(%eax)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edx, 44(%eax)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl (%esp), %edx # 4-byte Reload
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edx, 32(%eax)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edx, 36(%eax)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edx, 24(%eax)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edx, 28(%eax)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edx, 16(%eax)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edx, 20(%eax)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edx, 8(%eax)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edx, 12(%eax)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    sarxl %ecx, {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    # kill: def $cl killed $cl killed $ecx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %edi, %eax
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, (%edx)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %esi, 60(%edx)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, 52(%edx)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, 44(%edx)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, 36(%edx)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, 28(%edx)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, 20(%edx)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, 12(%edx)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, 4(%edx)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    addl $200, %esp
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %esi, %ebx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ebx, (%eax)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ecx, 4(%eax)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edx, 60(%eax)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    addl $188, %esp
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    popl %esi
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    popl %edi
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    popl %ebx

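The regenerated X86 sequences in these shift tests all share one dataflow: spill the (zero-extended) value to the stack, round the bit offset down to an aligned 32-bit word (shrl $3 plus andl $60 in the 64-byte case above, shrb $3 plus andb $12 in the 16-byte cases below), load two adjacent words, and funnel them together. A minimal C++ sketch of that dataflow for the 16-byte with-zero-upper-half case follows; the function name and fixed sizes are illustrative only, not code from this patch, and it assumes a little-endian host like the x86 tests.

#include <cstdint>
#include <cstring>

// Sketch of the load_4byte_chunk_of_16byte_alloca_with_zero_upper_half
// lowering checked below: a 32-byte buffer whose upper half stays zero,
// an aligned word index derived from the bit offset, and two adjacent
// 32-bit loads funneled into one result.
uint32_t shift_chunk(const uint8_t (&x)[16], unsigned bitOff) {
  uint32_t buf[8] = {};                  // movaps %xmm1, ... (zero upper half)
  std::memcpy(buf, x, sizeof(x));        // movdqa %xmm0, (%esp)
  unsigned wordByte = (bitOff / 8) & 12; // shrb $3, %dl ; andb $12, %dl
  unsigned rem = bitOff & 31;            // residual in-word shift, lives in %cl
  uint32_t lo, hi;
  std::memcpy(&lo, reinterpret_cast<const uint8_t *>(buf) + wordByte, 4);
  std::memcpy(&hi, reinterpret_cast<const uint8_t *>(buf) + wordByte + 4, 4);
  if (rem == 0)
    return lo;                           // word-aligned shift: the load suffices
  return (lo >> rem) | (hi << (32 - rem)); // shrl ; notb+shll ; orl
}

The three X86 check prefixes differ only in how the final combine is spelled: shrl/notb/shll/orl without BMI2 or SHLD, a single shrdl when SHLD is available, and shrxl/shlxl with BMI2.
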
diff --git a/llvm/test/CodeGen/X86/widen-load-of-small-alloca-with-zero-upper-half.ll b/llvm/test/CodeGen/X86/widen-load-of-small-alloca-with-zero-upper-half.ll
index 9ae1f270e88337..044be12a395433 100644
--- a/llvm/test/CodeGen/X86/widen-load-of-small-alloca-with-zero-upper-half.ll
+++ b/llvm/test/CodeGen/X86/widen-load-of-small-alloca-with-zero-upper-half.ll
@@ -432,30 +432,89 @@ define void @load_1byte_chunk_of_16byte_alloca_with_zero_upper_half(ptr %src, i6
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    movb %cl, (%rdx)
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    retq
 ;
-; X86-LABEL: load_1byte_chunk_of_16byte_alloca_with_zero_upper_half:
-; X86:       # %bb.0:
-; X86-NEXT:    subl $32, %esp
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
-; X86-NEXT:    shll $3, %ecx
-; X86-NEXT:    movss %xmm0, (%esp)
-; X86-NEXT:    shufps {{.*#+}} xmm0 = xmm0[1,1,1,1]
-; X86-NEXT:    movss %xmm0, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT:    shrb $3, %cl
-; X86-NEXT:    andb $15, %cl
-; X86-NEXT:    movzbl %cl, %ecx
-; X86-NEXT:    movzbl (%esp,%ecx), %ecx
-; X86-NEXT:    movb %cl, (%eax)
-; X86-NEXT:    addl $32, %esp
-; X86-NEXT:    retl
+; X86-NO-BMI2-NO-SHLD-LABEL: load_1byte_chunk_of_16byte_alloca_with_zero_upper_half:
+; X86-NO-BMI2-NO-SHLD:       # %bb.0:
+; X86-NO-BMI2-NO-SHLD-NEXT:    pushl %esi
+; X86-NO-BMI2-NO-SHLD-NEXT:    subl $40, %esp
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-NO-BMI2-NO-SHLD-NEXT:    movq {{.*#+}} xmm0 = mem[0],zero
+; X86-NO-BMI2-NO-SHLD-NEXT:    movq {{.*#+}} xmm0 = xmm0[0],zero
+; X86-NO-BMI2-NO-SHLD-NEXT:    shll $3, %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT:    xorps %xmm1, %xmm1
+; X86-NO-BMI2-NO-SHLD-NEXT:    movaps %xmm1, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-NO-SHLD-NEXT:    movdqa %xmm0, (%esp)
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ecx, %edx
+; X86-NO-BMI2-NO-SHLD-NEXT:    shrb $3, %dl
+; X86-NO-BMI2-NO-SHLD-NEXT:    andb $12, %dl
+; X86-NO-BMI2-NO-SHLD-NEXT:    movzbl %dl, %edx
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl (%esp,%edx), %esi
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl 4(%esp,%edx), %edx
+; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %esi
+; X86-NO-BMI2-NO-SHLD-NEXT:    notb %cl
+; X86-NO-BMI2-NO-SHLD-NEXT:    addl %edx, %edx
+; X86-NO-BMI2-NO-SHLD-NEXT:    # kill: def $cl killed $cl killed $ecx
+; X86-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %edx
+; X86-NO-BMI2-NO-SHLD-NEXT:    orl %esi, %edx
+; X86-NO-BMI2-NO-SHLD-NEXT:    movb %dl, (%eax)
+; X86-NO-BMI2-NO-SHLD-NEXT:    addl $40, %esp
+; X86-NO-BMI2-NO-SHLD-NEXT:    popl %esi
+; X86-NO-BMI2-NO-SHLD-NEXT:    retl
+;
+; X86-SHLD-LABEL: load_1byte_chunk_of_16byte_alloca_with_zero_upper_half:
+; X86-SHLD:       # %bb.0:
+; X86-SHLD-NEXT:    pushl %ebx
+; X86-SHLD-NEXT:    subl $40, %esp
+; X86-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-SHLD-NEXT:    movq {{.*#+}} xmm0 = mem[0],zero
+; X86-SHLD-NEXT:    movq {{.*#+}} xmm0 = xmm0[0],zero
+; X86-SHLD-NEXT:    shll $3, %ecx
+; X86-SHLD-NEXT:    xorps %xmm1, %xmm1
+; X86-SHLD-NEXT:    movaps %xmm1, {{[0-9]+}}(%esp)
+; X86-SHLD-NEXT:    movdqa %xmm0, (%esp)
+; X86-SHLD-NEXT:    movl %ecx, %edx
+; X86-SHLD-NEXT:    shrb $3, %dl
+; X86-SHLD-NEXT:    andb $12, %dl
+; X86-SHLD-NEXT:    movzbl %dl, %edx
+; X86-SHLD-NEXT:    movl (%esp,%edx), %ebx
+; X86-SHLD-NEXT:    movl 4(%esp,%edx), %edx
+; X86-SHLD-NEXT:    # kill: def $cl killed $cl killed $ecx
+; X86-SHLD-NEXT:    shrdl %cl, %edx, %ebx
+; X86-SHLD-NEXT:    movb %bl, (%eax)
+; X86-SHLD-NEXT:    addl $40, %esp
+; X86-SHLD-NEXT:    popl %ebx
+; X86-SHLD-NEXT:    retl
+;
+; X86-HAVE-BMI2-NO-SHLD-LABEL: load_1byte_chunk_of_16byte_alloca_with_zero_upper_half:
+; X86-HAVE-BMI2-NO-SHLD:       # %bb.0:
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    pushl %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    subl $40, %esp
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movq {{.*#+}} xmm0 = mem[0],zero
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movq {{.*#+}} xmm0 = xmm0[0],zero
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shll $3, %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    xorps %xmm1, %xmm1
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movaps %xmm1, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movdqa %xmm0, (%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrb $3, %dl
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    andb $12, %dl
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movzbl %dl, %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %ecx, (%esp,%edx), %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    notb %cl
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 4(%esp,%edx), %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    addl %edx, %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %ecx, %edx, %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl %esi, %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movb %cl, (%eax)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    addl $40, %esp
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    popl %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    retl
   %init = load <8 x i8>, ptr %src, align 1
   %intermediate.sroa.0.0.vec.expand = shufflevector <8 x i8> %init, <8 x i8> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
   %intermediate.sroa.0.0.vecblend = shufflevector <16 x i8> %intermediate.sroa.0.0.vec.expand, <16 x i8> <i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0>, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
@@ -505,30 +564,89 @@ define void @load_2byte_chunk_of_16byte_alloca_with_zero_upper_half(ptr %src, i6
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    movw %cx, (%rdx)
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    retq
 ;
-; X86-LABEL: load_2byte_chunk_of_16byte_alloca_with_zero_upper_half:
-; X86:       # %bb.0:
-; X86-NEXT:    subl $32, %esp
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
-; X86-NEXT:    shll $3, %ecx
-; X86-NEXT:    movss %xmm0, (%esp)
-; X86-NEXT:    shufps {{.*#+}} xmm0 = xmm0[1,1,1,1]
-; X86-NEXT:    movss %xmm0, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT:    shrb $3, %cl
-; X86-NEXT:    andb $15, %cl
-; X86-NEXT:    movzbl %cl, %ecx
-; X86-NEXT:    movl (%esp,%ecx), %ecx
-; X86-NEXT:    movw %cx, (%eax)
-; X86-NEXT:    addl $32, %esp
-; X86-NEXT:    retl
+; X86-NO-BMI2-NO-SHLD-LABEL: load_2byte_chunk_of_16byte_alloca_with_zero_upper_half:
+; X86-NO-BMI2-NO-SHLD:       # %bb.0:
+; X86-NO-BMI2-NO-SHLD-NEXT:    pushl %esi
+; X86-NO-BMI2-NO-SHLD-NEXT:    subl $40, %esp
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-NO-BMI2-NO-SHLD-NEXT:    movq {{.*#+}} xmm0 = mem[0],zero
+; X86-NO-BMI2-NO-SHLD-NEXT:    movq {{.*#+}} xmm0 = xmm0[0],zero
+; X86-NO-BMI2-NO-SHLD-NEXT:    shll $3, %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT:    xorps %xmm1, %xmm1
+; X86-NO-BMI2-NO-SHLD-NEXT:    movaps %xmm1, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-NO-SHLD-NEXT:    movdqa %xmm0, (%esp)
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ecx, %edx
+; X86-NO-BMI2-NO-SHLD-NEXT:    shrb $3, %dl
+; X86-NO-BMI2-NO-SHLD-NEXT:    andb $12, %dl
+; X86-NO-BMI2-NO-SHLD-NEXT:    movzbl %dl, %edx
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl (%esp,%edx), %esi
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl 4(%esp,%edx), %edx
+; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %esi
+; X86-NO-BMI2-NO-SHLD-NEXT:    notb %cl
+; X86-NO-BMI2-NO-SHLD-NEXT:    addl %edx, %edx
+; X86-NO-BMI2-NO-SHLD-NEXT:    # kill: def $cl killed $cl killed $ecx
+; X86-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %edx
+; X86-NO-BMI2-NO-SHLD-NEXT:    orl %esi, %edx
+; X86-NO-BMI2-NO-SHLD-NEXT:    movw %dx, (%eax)
+; X86-NO-BMI2-NO-SHLD-NEXT:    addl $40, %esp
+; X86-NO-BMI2-NO-SHLD-NEXT:    popl %esi
+; X86-NO-BMI2-NO-SHLD-NEXT:    retl
+;
+; X86-SHLD-LABEL: load_2byte_chunk_of_16byte_alloca_with_zero_upper_half:
+; X86-SHLD:       # %bb.0:
+; X86-SHLD-NEXT:    pushl %esi
+; X86-SHLD-NEXT:    subl $40, %esp
+; X86-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-SHLD-NEXT:    movq {{.*#+}} xmm0 = mem[0],zero
+; X86-SHLD-NEXT:    movq {{.*#+}} xmm0 = xmm0[0],zero
+; X86-SHLD-NEXT:    shll $3, %ecx
+; X86-SHLD-NEXT:    xorps %xmm1, %xmm1
+; X86-SHLD-NEXT:    movaps %xmm1, {{[0-9]+}}(%esp)
+; X86-SHLD-NEXT:    movdqa %xmm0, (%esp)
+; X86-SHLD-NEXT:    movl %ecx, %edx
+; X86-SHLD-NEXT:    shrb $3, %dl
+; X86-SHLD-NEXT:    andb $12, %dl
+; X86-SHLD-NEXT:    movzbl %dl, %edx
+; X86-SHLD-NEXT:    movl (%esp,%edx), %esi
+; X86-SHLD-NEXT:    movl 4(%esp,%edx), %edx
+; X86-SHLD-NEXT:    # kill: def $cl killed $cl killed $ecx
+; X86-SHLD-NEXT:    shrdl %cl, %edx, %esi
+; X86-SHLD-NEXT:    movw %si, (%eax)
+; X86-SHLD-NEXT:    addl $40, %esp
+; X86-SHLD-NEXT:    popl %esi
+; X86-SHLD-NEXT:    retl
+;
+; X86-HAVE-BMI2-NO-SHLD-LABEL: load_2byte_chunk_of_16byte_alloca_with_zero_upper_half:
+; X86-HAVE-BMI2-NO-SHLD:       # %bb.0:
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    pushl %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    subl $40, %esp
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movq {{.*#+}} xmm0 = mem[0],zero
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movq {{.*#+}} xmm0 = xmm0[0],zero
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shll $3, %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    xorps %xmm1, %xmm1
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movaps %xmm1, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movdqa %xmm0, (%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrb $3, %dl
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    andb $12, %dl
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movzbl %dl, %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %ecx, (%esp,%edx), %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    notb %cl
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 4(%esp,%edx), %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    addl %edx, %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %ecx, %edx, %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl %esi, %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movw %cx, (%eax)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    addl $40, %esp
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    popl %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    retl
   %init = load <8 x i8>, ptr %src, align 1
   %intermediate.sroa.0.0.vec.expand = shufflevector <8 x i8> %init, <8 x i8> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
   %intermediate.sroa.0.0.vecblend = shufflevector <16 x i8> %intermediate.sroa.0.0.vec.expand, <16 x i8> <i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0>, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
@@ -577,30 +695,89 @@ define void @load_4byte_chunk_of_16byte_alloca_with_zero_upper_half(ptr %src, i6
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, (%rdx)
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    retq
 ;
-; X86-LABEL: load_4byte_chunk_of_16byte_alloca_with_zero_upper_half:
-; X86:       # %bb.0:
-; X86-NEXT:    subl $32, %esp
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
-; X86-NEXT:    shll $3, %ecx
-; X86-NEXT:    movss %xmm0, (%esp)
-; X86-NEXT:    shufps {{.*#+}} xmm0 = xmm0[1,1,1,1]
-; X86-NEXT:    movss %xmm0, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT:    shrb $3, %cl
-; X86-NEXT:    andb $15, %cl
-; X86-NEXT:    movzbl %cl, %ecx
-; X86-NEXT:    movl (%esp,%ecx), %ecx
-; X86-NEXT:    movl %ecx, (%eax)
-; X86-NEXT:    addl $32, %esp
-; X86-NEXT:    retl
+; X86-NO-BMI2-NO-SHLD-LABEL: load_4byte_chunk_of_16byte_alloca_with_zero_upper_half:
+; X86-NO-BMI2-NO-SHLD:       # %bb.0:
+; X86-NO-BMI2-NO-SHLD-NEXT:    pushl %esi
+; X86-NO-BMI2-NO-SHLD-NEXT:    subl $40, %esp
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-NO-BMI2-NO-SHLD-NEXT:    movq {{.*#+}} xmm0 = mem[0],zero
+; X86-NO-BMI2-NO-SHLD-NEXT:    movq {{.*#+}} xmm0 = xmm0[0],zero
+; X86-NO-BMI2-NO-SHLD-NEXT:    shll $3, %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT:    xorps %xmm1, %xmm1
+; X86-NO-BMI2-NO-SHLD-NEXT:    movaps %xmm1, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-NO-SHLD-NEXT:    movdqa %xmm0, (%esp)
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ecx, %edx
+; X86-NO-BMI2-NO-SHLD-NEXT:    shrb $3, %dl
+; X86-NO-BMI2-NO-SHLD-NEXT:    andb $12, %dl
+; X86-NO-BMI2-NO-SHLD-NEXT:    movzbl %dl, %edx
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl (%esp,%edx), %esi
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl 4(%esp,%edx), %edx
+; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %esi
+; X86-NO-BMI2-NO-SHLD-NEXT:    notb %cl
+; X86-NO-BMI2-NO-SHLD-NEXT:    addl %edx, %edx
+; X86-NO-BMI2-NO-SHLD-NEXT:    # kill: def $cl killed $cl killed $ecx
+; X86-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %edx
+; X86-NO-BMI2-NO-SHLD-NEXT:    orl %esi, %edx
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %edx, (%eax)
+; X86-NO-BMI2-NO-SHLD-NEXT:    addl $40, %esp
+; X86-NO-BMI2-NO-SHLD-NEXT:    popl %esi
+; X86-NO-BMI2-NO-SHLD-NEXT:    retl
+;
+; X86-SHLD-LABEL: load_4byte_chunk_of_16byte_alloca_with_zero_upper_half:
+; X86-SHLD:       # %bb.0:
+; X86-SHLD-NEXT:    pushl %esi
+; X86-SHLD-NEXT:    subl $40, %esp
+; X86-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-SHLD-NEXT:    movq {{.*#+}} xmm0 = mem[0],zero
+; X86-SHLD-NEXT:    movq {{.*#+}} xmm0 = xmm0[0],zero
+; X86-SHLD-NEXT:    shll $3, %ecx
+; X86-SHLD-NEXT:    xorps %xmm1, %xmm1
+; X86-SHLD-NEXT:    movaps %xmm1, {{[0-9]+}}(%esp)
+; X86-SHLD-NEXT:    movdqa %xmm0, (%esp)
+; X86-SHLD-NEXT:    movl %ecx, %edx
+; X86-SHLD-NEXT:    shrb $3, %dl
+; X86-SHLD-NEXT:    andb $12, %dl
+; X86-SHLD-NEXT:    movzbl %dl, %edx
+; X86-SHLD-NEXT:    movl (%esp,%edx), %esi
+; X86-SHLD-NEXT:    movl 4(%esp,%edx), %edx
+; X86-SHLD-NEXT:    # kill: def $cl killed $cl killed $ecx
+; X86-SHLD-NEXT:    shrdl %cl, %edx, %esi
+; X86-SHLD-NEXT:    movl %esi, (%eax)
+; X86-SHLD-NEXT:    addl $40, %esp
+; X86-SHLD-NEXT:    popl %esi
+; X86-SHLD-NEXT:    retl
+;
+; X86-HAVE-BMI2-NO-SHLD-LABEL: load_4byte_chunk_of_16byte_alloca_with_zero_upper_half:
+; X86-HAVE-BMI2-NO-SHLD:       # %bb.0:
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    pushl %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    subl $40, %esp
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movq {{.*#+}} xmm0 = mem[0],zero
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movq {{.*#+}} xmm0 = xmm0[0],zero
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shll $3, %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    xorps %xmm1, %xmm1
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movaps %xmm1, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movdqa %xmm0, (%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrb $3, %dl
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    andb $12, %dl
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movzbl %dl, %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %ecx, (%esp,%edx), %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    notb %cl
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 4(%esp,%edx), %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    addl %edx, %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %ecx, %edx, %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl %esi, %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, (%eax)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    addl $40, %esp
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    popl %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    retl
   %init = load <8 x i8>, ptr %src, align 1
   %intermediate.sroa.0.0.vec.expand = shufflevector <8 x i8> %init, <8 x i8> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
   %intermediate.sroa.0.0.vecblend = shufflevector <16 x i8> %intermediate.sroa.0.0.vec.expand, <16 x i8> <i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0>, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
@@ -649,32 +826,128 @@ define void @load_8byte_chunk_of_16byte_alloca_with_zero_upper_half(ptr %src, i6
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq %rcx, (%rdx)
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    retq
 ;
-; X86-LABEL: load_8byte_chunk_of_16byte_alloca_with_zero_upper_half:
-; X86:       # %bb.0:
-; X86-NEXT:    subl $32, %esp
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
-; X86-NEXT:    shll $3, %ecx
-; X86-NEXT:    movss %xmm0, (%esp)
-; X86-NEXT:    shufps {{.*#+}} xmm0 = xmm0[1,1,1,1]
-; X86-NEXT:    movss %xmm0, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT:    shrb $3, %cl
-; X86-NEXT:    andb $15, %cl
-; X86-NEXT:    movzbl %cl, %ecx
-; X86-NEXT:    movl (%esp,%ecx), %edx
-; X86-NEXT:    movl 4(%esp,%ecx), %ecx
-; X86-NEXT:    movl %ecx, 4(%eax)
-; X86-NEXT:    movl %edx, (%eax)
-; X86-NEXT:    addl $32, %esp
-; X86-NEXT:    retl
+; X86-NO-BMI2-NO-SHLD-LABEL: load_8byte_chunk_of_16byte_alloca_with_zero_upper_half:
+; X86-NO-BMI2-NO-SHLD:       # %bb.0:
+; X86-NO-BMI2-NO-SHLD-NEXT:    pushl %ebp
+; X86-NO-BMI2-NO-SHLD-NEXT:    pushl %ebx
+; X86-NO-BMI2-NO-SHLD-NEXT:    pushl %edi
+; X86-NO-BMI2-NO-SHLD-NEXT:    pushl %esi
+; X86-NO-BMI2-NO-SHLD-NEXT:    subl $44, %esp
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT:    movq {{.*#+}} xmm0 = mem[0],zero
+; X86-NO-BMI2-NO-SHLD-NEXT:    movq {{.*#+}} xmm0 = xmm0[0],zero
+; X86-NO-BMI2-NO-SHLD-NEXT:    shll $3, %eax
+; X86-NO-BMI2-NO-SHLD-NEXT:    xorps %xmm1, %xmm1
+; X86-NO-BMI2-NO-SHLD-NEXT:    movaps %xmm1, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-NO-SHLD-NEXT:    movdqa %xmm0, (%esp)
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT:    shrb $3, %cl
+; X86-NO-BMI2-NO-SHLD-NEXT:    andb $12, %cl
+; X86-NO-BMI2-NO-SHLD-NEXT:    movzbl %cl, %ebx
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl (%esp,%ebx), %ebp
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl 4(%esp,%ebx), %esi
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %ebp
+; X86-NO-BMI2-NO-SHLD-NEXT:    notb %cl
+; X86-NO-BMI2-NO-SHLD-NEXT:    leal (%esi,%esi), %edi
+; X86-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %edi
+; X86-NO-BMI2-NO-SHLD-NEXT:    orl %ebp, %edi
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %esi
+; X86-NO-BMI2-NO-SHLD-NEXT:    andb $24, %al
+; X86-NO-BMI2-NO-SHLD-NEXT:    notb %al
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl 8(%esp,%ebx), %ebx
+; X86-NO-BMI2-NO-SHLD-NEXT:    addl %ebx, %ebx
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %ebx
+; X86-NO-BMI2-NO-SHLD-NEXT:    orl %esi, %ebx
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ebx, 4(%edx)
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %edi, (%edx)
+; X86-NO-BMI2-NO-SHLD-NEXT:    addl $44, %esp
+; X86-NO-BMI2-NO-SHLD-NEXT:    popl %esi
+; X86-NO-BMI2-NO-SHLD-NEXT:    popl %edi
+; X86-NO-BMI2-NO-SHLD-NEXT:    popl %ebx
+; X86-NO-BMI2-NO-SHLD-NEXT:    popl %ebp
+; X86-NO-BMI2-NO-SHLD-NEXT:    retl
+;
+; X86-SHLD-LABEL: load_8byte_chunk_of_16byte_alloca_with_zero_upper_half:
+; X86-SHLD:       # %bb.0:
+; X86-SHLD-NEXT:    pushl %ebx
+; X86-SHLD-NEXT:    pushl %edi
+; X86-SHLD-NEXT:    pushl %esi
+; X86-SHLD-NEXT:    subl $32, %esp
+; X86-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-SHLD-NEXT:    movq {{.*#+}} xmm0 = mem[0],zero
+; X86-SHLD-NEXT:    movq {{.*#+}} xmm0 = xmm0[0],zero
+; X86-SHLD-NEXT:    shll $3, %ecx
+; X86-SHLD-NEXT:    xorps %xmm1, %xmm1
+; X86-SHLD-NEXT:    movaps %xmm1, {{[0-9]+}}(%esp)
+; X86-SHLD-NEXT:    movdqa %xmm0, (%esp)
+; X86-SHLD-NEXT:    movl %ecx, %edx
+; X86-SHLD-NEXT:    shrb $3, %dl
+; X86-SHLD-NEXT:    andb $12, %dl
+; X86-SHLD-NEXT:    movzbl %dl, %edx
+; X86-SHLD-NEXT:    movl 8(%esp,%edx), %esi
+; X86-SHLD-NEXT:    movl (%esp,%edx), %edi
+; X86-SHLD-NEXT:    movl 4(%esp,%edx), %edx
+; X86-SHLD-NEXT:    movl %edx, %ebx
+; X86-SHLD-NEXT:    shrdl %cl, %esi, %ebx
+; X86-SHLD-NEXT:    # kill: def $cl killed $cl killed $ecx
+; X86-SHLD-NEXT:    shrdl %cl, %edx, %edi
+; X86-SHLD-NEXT:    movl %ebx, 4(%eax)
+; X86-SHLD-NEXT:    movl %edi, (%eax)
+; X86-SHLD-NEXT:    addl $32, %esp
+; X86-SHLD-NEXT:    popl %esi
+; X86-SHLD-NEXT:    popl %edi
+; X86-SHLD-NEXT:    popl %ebx
+; X86-SHLD-NEXT:    retl
+;
+; X86-HAVE-BMI2-NO-SHLD-LABEL: load_8byte_chunk_of_16byte_alloca_with_zero_upper_half:
+; X86-HAVE-BMI2-NO-SHLD:       # %bb.0:
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    pushl %ebp
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    pushl %ebx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    pushl %edi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    pushl %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    subl $44, %esp
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movq {{.*#+}} xmm0 = mem[0],zero
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movq {{.*#+}} xmm0 = xmm0[0],zero
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shll $3, %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    xorps %xmm1, %xmm1
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movaps %xmm1, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movdqa %xmm0, (%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrb $3, %dl
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    andb $12, %dl
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movzbl %dl, %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %ecx, (%esp,%edx), %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, %ebx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    notb %bl
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 4(%esp,%edx), %edi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 8(%esp,%edx), %ebp
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    leal (%edi,%edi), %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %ebx, %edx, %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl %esi, %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %ecx, %edi, %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    andb $24, %cl
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    notb %cl
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    addl %ebp, %ebp
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %ecx, %ebp, %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl %esi, %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, 4(%eax)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %edx, (%eax)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    addl $44, %esp
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    popl %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    popl %edi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    popl %ebx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    popl %ebp
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    retl
   %init = load <8 x i8>, ptr %src, align 1
   %intermediate.sroa.0.0.vec.expand = shufflevector <8 x i8> %init, <8 x i8> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
   %intermediate.sroa.0.0.vecblend = shufflevector <16 x i8> %intermediate.sroa.0.0.vec.expand, <16 x i8> <i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0>, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
@@ -689,58 +962,123 @@ define void @load_8byte_chunk_of_16byte_alloca_with_zero_upper_half(ptr %src, i6
 }
 
 define void @load_1byte_chunk_of_32byte_alloca_with_zero_upper_half(ptr %src, i64 %byteOff, ptr %dst) nounwind {
-; X64-LABEL: load_1byte_chunk_of_32byte_alloca_with_zero_upper_half:
-; X64:       # %bb.0:
-; X64-NEXT:    movdqu (%rdi), %xmm0
-; X64-NEXT:    shll $3, %esi
-; X64-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
-; X64-NEXT:    movq %xmm0, -{{[0-9]+}}(%rsp)
-; X64-NEXT:    movq %xmm1, -{{[0-9]+}}(%rsp)
-; X64-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
-; X64-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
-; X64-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
-; X64-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
-; X64-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
-; X64-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
-; X64-NEXT:    shrb $3, %sil
-; X64-NEXT:    movzbl %sil, %eax
-; X64-NEXT:    movzbl -64(%rsp,%rax), %eax
-; X64-NEXT:    movb %al, (%rdx)
-; X64-NEXT:    retq
-;
-; X86-LABEL: load_1byte_chunk_of_32byte_alloca_with_zero_upper_half:
-; X86:       # %bb.0:
-; X86-NEXT:    subl $64, %esp
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-NEXT:    movdqu (%edx), %xmm0
-; X86-NEXT:    shll $3, %ecx
-; X86-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
-; X86-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[2,3,2,3]
-; X86-NEXT:    pshufd {{.*#+}} xmm3 = xmm0[3,3,3,3]
-; X86-NEXT:    movd %xmm0, (%esp)
-; X86-NEXT:    movd %xmm3, {{[0-9]+}}(%esp)
-; X86-NEXT:    movd %xmm2, {{[0-9]+}}(%esp)
-; X86-NEXT:    movd %xmm1, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT:    shrb $3, %cl
-; X86-NEXT:    movzbl %cl, %ecx
-; X86-NEXT:    movzbl (%esp,%ecx), %ecx
-; X86-NEXT:    movb %cl, (%eax)
-; X86-NEXT:    addl $64, %esp
-; X86-NEXT:    retl
+; X64-NO-BMI2-LABEL: load_1byte_chunk_of_32byte_alloca_with_zero_upper_half:
+; X64-NO-BMI2:       # %bb.0:
+; X64-NO-BMI2-NEXT:    movups (%rdi), %xmm0
+; X64-NO-BMI2-NEXT:    xorps %xmm1, %xmm1
+; X64-NO-BMI2-NEXT:    leal (,%rsi,8), %ecx
+; X64-NO-BMI2-NEXT:    movaps %xmm1, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-NEXT:    movaps %xmm1, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-NEXT:    movaps %xmm1, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-NEXT:    movl %ecx, %eax
+; X64-NO-BMI2-NEXT:    shrb $6, %al
+; X64-NO-BMI2-NEXT:    movzbl %al, %eax
+; X64-NO-BMI2-NEXT:    movq -72(%rsp,%rax,8), %rax
+; X64-NO-BMI2-NEXT:    # kill: def $cl killed $cl killed $ecx
+; X64-NO-BMI2-NEXT:    shrq %cl, %rax
+; X64-NO-BMI2-NEXT:    movb %al, (%rdx)
+; X64-NO-BMI2-NEXT:    retq
+;
+; X64-BMI2-LABEL: load_1byte_chunk_of_32byte_alloca_with_zero_upper_half:
+; X64-BMI2:       # %bb.0:
+; X64-BMI2-NEXT:    movups (%rdi), %xmm0
+; X64-BMI2-NEXT:    xorps %xmm1, %xmm1
+; X64-BMI2-NEXT:    shll $3, %esi
+; X64-BMI2-NEXT:    movaps %xmm1, -{{[0-9]+}}(%rsp)
+; X64-BMI2-NEXT:    movaps %xmm1, -{{[0-9]+}}(%rsp)
+; X64-BMI2-NEXT:    movaps %xmm1, -{{[0-9]+}}(%rsp)
+; X64-BMI2-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
+; X64-BMI2-NEXT:    movl %esi, %eax
+; X64-BMI2-NEXT:    shrb $6, %al
+; X64-BMI2-NEXT:    movzbl %al, %eax
+; X64-BMI2-NEXT:    shrxq %rsi, -72(%rsp,%rax,8), %rax
+; X64-BMI2-NEXT:    movb %al, (%rdx)
+; X64-BMI2-NEXT:    retq
+;
+; X86-NO-BMI2-NO-SHLD-LABEL: load_1byte_chunk_of_32byte_alloca_with_zero_upper_half:
+; X86-NO-BMI2-NO-SHLD:       # %bb.0:
+; X86-NO-BMI2-NO-SHLD-NEXT:    pushl %esi
+; X86-NO-BMI2-NO-SHLD-NEXT:    subl $72, %esp
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-NO-BMI2-NO-SHLD-NEXT:    movups (%edx), %xmm0
+; X86-NO-BMI2-NO-SHLD-NEXT:    xorps %xmm1, %xmm1
+; X86-NO-BMI2-NO-SHLD-NEXT:    shll $3, %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT:    movaps %xmm1, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-NO-SHLD-NEXT:    movaps %xmm1, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-NO-SHLD-NEXT:    movaps %xmm1, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-NO-SHLD-NEXT:    movaps %xmm0, (%esp)
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ecx, %edx
+; X86-NO-BMI2-NO-SHLD-NEXT:    shrb $5, %dl
+; X86-NO-BMI2-NO-SHLD-NEXT:    movzbl %dl, %edx
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl (%esp,%edx,4), %esi
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl 4(%esp,%edx,4), %edx
+; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %esi
+; X86-NO-BMI2-NO-SHLD-NEXT:    notb %cl
+; X86-NO-BMI2-NO-SHLD-NEXT:    addl %edx, %edx
+; X86-NO-BMI2-NO-SHLD-NEXT:    # kill: def $cl killed $cl killed $ecx
+; X86-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %edx
+; X86-NO-BMI2-NO-SHLD-NEXT:    orl %esi, %edx
+; X86-NO-BMI2-NO-SHLD-NEXT:    movb %dl, (%eax)
+; X86-NO-BMI2-NO-SHLD-NEXT:    addl $72, %esp
+; X86-NO-BMI2-NO-SHLD-NEXT:    popl %esi
+; X86-NO-BMI2-NO-SHLD-NEXT:    retl
+;
+; X86-SHLD-LABEL: load_1byte_chunk_of_32byte_alloca_with_zero_upper_half:
+; X86-SHLD:       # %bb.0:
+; X86-SHLD-NEXT:    pushl %ebx
+; X86-SHLD-NEXT:    subl $72, %esp
+; X86-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-SHLD-NEXT:    movups (%edx), %xmm0
+; X86-SHLD-NEXT:    xorps %xmm1, %xmm1
+; X86-SHLD-NEXT:    shll $3, %ecx
+; X86-SHLD-NEXT:    movaps %xmm1, {{[0-9]+}}(%esp)
+; X86-SHLD-NEXT:    movaps %xmm1, {{[0-9]+}}(%esp)
+; X86-SHLD-NEXT:    movaps %xmm1, {{[0-9]+}}(%esp)
+; X86-SHLD-NEXT:    movaps %xmm0, (%esp)
+; X86-SHLD-NEXT:    movl %ecx, %edx
+; X86-SHLD-NEXT:    shrb $5, %dl
+; X86-SHLD-NEXT:    movzbl %dl, %edx
+; X86-SHLD-NEXT:    movl (%esp,%edx,4), %ebx
+; X86-SHLD-NEXT:    movl 4(%esp,%edx,4), %edx
+; X86-SHLD-NEXT:    # kill: def $cl killed $cl killed $ecx
+; X86-SHLD-NEXT:    shrdl %cl, %edx, %ebx
+; X86-SHLD-NEXT:    movb %bl, (%eax)
+; X86-SHLD-NEXT:    addl $72, %esp
+; X86-SHLD-NEXT:    popl %ebx
+; X86-SHLD-NEXT:    retl
+;
+; X86-HAVE-BMI2-NO-SHLD-LABEL: load_1byte_chunk_of_32byte_alloca_with_zero_upper_half:
+; X86-HAVE-BMI2-NO-SHLD:       # %bb.0:
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    pushl %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    subl $72, %esp
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movups (%edx), %xmm0
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    xorps %xmm1, %xmm1
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shll $3, %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movaps %xmm1, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movaps %xmm1, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movaps %xmm1, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movaps %xmm0, (%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrb $5, %dl
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movzbl %dl, %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %ecx, (%esp,%edx,4), %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    notb %cl
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 4(%esp,%edx,4), %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    addl %edx, %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %ecx, %edx, %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl %esi, %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movb %cl, (%eax)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    addl $72, %esp
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    popl %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    retl
   %init = load <16 x i8>, ptr %src, align 1
   %intermediate.sroa.0.0.vec.expand = shufflevector <16 x i8> %init, <16 x i8> poison, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
   %intermediate.sroa.0.0.vecblend = shufflevector <32 x i8> %intermediate.sroa.0.0.vec.expand, <32 x i8> <i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0>, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63>
@@ -756,58 +1094,136 @@ define void @load_1byte_chunk_of_32byte_alloca_with_zero_upper_half(ptr %src, i6
 }
 
 define void @load_2byte_chunk_of_32byte_alloca_with_zero_upper_half(ptr %src, i64 %byteOff, ptr %dst) nounwind {
-; X64-LABEL: load_2byte_chunk_of_32byte_alloca_with_zero_upper_half:
-; X64:       # %bb.0:
-; X64-NEXT:    movdqu (%rdi), %xmm0
-; X64-NEXT:    shll $3, %esi
-; X64-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
-; X64-NEXT:    movq %xmm0, -{{[0-9]+}}(%rsp)
-; X64-NEXT:    movq %xmm1, -{{[0-9]+}}(%rsp)
-; X64-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
-; X64-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
-; X64-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
-; X64-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
-; X64-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
-; X64-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
-; X64-NEXT:    shrb $3, %sil
-; X64-NEXT:    movzbl %sil, %eax
-; X64-NEXT:    movq -64(%rsp,%rax), %rax
-; X64-NEXT:    movw %ax, (%rdx)
-; X64-NEXT:    retq
-;
-; X86-LABEL: load_2byte_chunk_of_32byte_alloca_with_zero_upper_half:
-; X86:       # %bb.0:
-; X86-NEXT:    subl $64, %esp
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-NEXT:    movdqu (%edx), %xmm0
-; X86-NEXT:    shll $3, %ecx
-; X86-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
-; X86-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[2,3,2,3]
-; X86-NEXT:    pshufd {{.*#+}} xmm3 = xmm0[3,3,3,3]
-; X86-NEXT:    movd %xmm0, (%esp)
-; X86-NEXT:    movd %xmm3, {{[0-9]+}}(%esp)
-; X86-NEXT:    movd %xmm2, {{[0-9]+}}(%esp)
-; X86-NEXT:    movd %xmm1, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT:    shrb $3, %cl
-; X86-NEXT:    movzbl %cl, %ecx
-; X86-NEXT:    movl (%esp,%ecx), %ecx
-; X86-NEXT:    movw %cx, (%eax)
-; X86-NEXT:    addl $64, %esp
-; X86-NEXT:    retl
+; X64-NO-BMI2-LABEL: load_2byte_chunk_of_32byte_alloca_with_zero_upper_half:
+; X64-NO-BMI2:       # %bb.0:
+; X64-NO-BMI2-NEXT:    movups (%rdi), %xmm0
+; X64-NO-BMI2-NEXT:    xorps %xmm1, %xmm1
+; X64-NO-BMI2-NEXT:    leal (,%rsi,8), %ecx
+; X64-NO-BMI2-NEXT:    movaps %xmm1, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-NEXT:    movaps %xmm1, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-NEXT:    movaps %xmm1, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-NEXT:    movl %ecx, %eax
+; X64-NO-BMI2-NEXT:    shrb $6, %al
+; X64-NO-BMI2-NEXT:    movzbl %al, %eax
+; X64-NO-BMI2-NEXT:    movq -72(%rsp,%rax,8), %rsi
+; X64-NO-BMI2-NEXT:    shrq %cl, %rsi
+; X64-NO-BMI2-NEXT:    movl -64(%rsp,%rax,8), %eax
+; X64-NO-BMI2-NEXT:    addl %eax, %eax
+; X64-NO-BMI2-NEXT:    andb $56, %cl
+; X64-NO-BMI2-NEXT:    notb %cl
+; X64-NO-BMI2-NEXT:    # kill: def $cl killed $cl killed $ecx
+; X64-NO-BMI2-NEXT:    shlq %cl, %rax
+; X64-NO-BMI2-NEXT:    orl %esi, %eax
+; X64-NO-BMI2-NEXT:    movw %ax, (%rdx)
+; X64-NO-BMI2-NEXT:    retq
+;
+; X64-BMI2-LABEL: load_2byte_chunk_of_32byte_alloca_with_zero_upper_half:
+; X64-BMI2:       # %bb.0:
+; X64-BMI2-NEXT:    movups (%rdi), %xmm0
+; X64-BMI2-NEXT:    xorps %xmm1, %xmm1
+; X64-BMI2-NEXT:    shll $3, %esi
+; X64-BMI2-NEXT:    movaps %xmm1, -{{[0-9]+}}(%rsp)
+; X64-BMI2-NEXT:    movaps %xmm1, -{{[0-9]+}}(%rsp)
+; X64-BMI2-NEXT:    movaps %xmm1, -{{[0-9]+}}(%rsp)
+; X64-BMI2-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
+; X64-BMI2-NEXT:    movl %esi, %eax
+; X64-BMI2-NEXT:    shrb $6, %al
+; X64-BMI2-NEXT:    movzbl %al, %eax
+; X64-BMI2-NEXT:    shrxq %rsi, -72(%rsp,%rax,8), %rcx
+; X64-BMI2-NEXT:    # kill: def $sil killed $sil killed $rsi def $rsi
+; X64-BMI2-NEXT:    andb $56, %sil
+; X64-BMI2-NEXT:    notb %sil
+; X64-BMI2-NEXT:    movl -64(%rsp,%rax,8), %eax
+; X64-BMI2-NEXT:    addl %eax, %eax
+; X64-BMI2-NEXT:    shlxq %rsi, %rax, %rax
+; X64-BMI2-NEXT:    orl %eax, %ecx
+; X64-BMI2-NEXT:    movw %cx, (%rdx)
+; X64-BMI2-NEXT:    retq
+;
+; X86-NO-BMI2-NO-SHLD-LABEL: load_2byte_chunk_of_32byte_alloca_with_zero_upper_half:
+; X86-NO-BMI2-NO-SHLD:       # %bb.0:
+; X86-NO-BMI2-NO-SHLD-NEXT:    pushl %esi
+; X86-NO-BMI2-NO-SHLD-NEXT:    subl $72, %esp
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-NO-BMI2-NO-SHLD-NEXT:    movups (%edx), %xmm0
+; X86-NO-BMI2-NO-SHLD-NEXT:    xorps %xmm1, %xmm1
+; X86-NO-BMI2-NO-SHLD-NEXT:    shll $3, %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT:    movaps %xmm1, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-NO-SHLD-NEXT:    movaps %xmm1, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-NO-SHLD-NEXT:    movaps %xmm1, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-NO-SHLD-NEXT:    movaps %xmm0, (%esp)
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ecx, %edx
+; X86-NO-BMI2-NO-SHLD-NEXT:    shrb $5, %dl
+; X86-NO-BMI2-NO-SHLD-NEXT:    movzbl %dl, %edx
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl (%esp,%edx,4), %esi
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl 4(%esp,%edx,4), %edx
+; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %esi
+; X86-NO-BMI2-NO-SHLD-NEXT:    notb %cl
+; X86-NO-BMI2-NO-SHLD-NEXT:    addl %edx, %edx
+; X86-NO-BMI2-NO-SHLD-NEXT:    # kill: def $cl killed $cl killed $ecx
+; X86-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %edx
+; X86-NO-BMI2-NO-SHLD-NEXT:    orl %esi, %edx
+; X86-NO-BMI2-NO-SHLD-NEXT:    movw %dx, (%eax)
+; X86-NO-BMI2-NO-SHLD-NEXT:    addl $72, %esp
+; X86-NO-BMI2-NO-SHLD-NEXT:    popl %esi
+; X86-NO-BMI2-NO-SHLD-NEXT:    retl
+;
+; X86-SHLD-LABEL: load_2byte_chunk_of_32byte_alloca_with_zero_upper_half:
+; X86-SHLD:       # %bb.0:
+; X86-SHLD-NEXT:    pushl %esi
+; X86-SHLD-NEXT:    subl $72, %esp
+; X86-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-SHLD-NEXT:    movups (%edx), %xmm0
+; X86-SHLD-NEXT:    xorps %xmm1, %xmm1
+; X86-SHLD-NEXT:    shll $3, %ecx
+; X86-SHLD-NEXT:    movaps %xmm1, {{[0-9]+}}(%esp)
+; X86-SHLD-NEXT:    movaps %xmm1, {{[0-9]+}}(%esp)
+; X86-SHLD-NEXT:    movaps %xmm1, {{[0-9]+}}(%esp)
+; X86-SHLD-NEXT:    movaps %xmm0, (%esp)
+; X86-SHLD-NEXT:    movl %ecx, %edx
+; X86-SHLD-NEXT:    shrb $5, %dl
+; X86-SHLD-NEXT:    movzbl %dl, %edx
+; X86-SHLD-NEXT:    movl (%esp,%edx,4), %esi
+; X86-SHLD-NEXT:    movl 4(%esp,%edx,4), %edx
+; X86-SHLD-NEXT:    # kill: def $cl killed $cl killed $ecx
+; X86-SHLD-NEXT:    shrdl %cl, %edx, %esi
+; X86-SHLD-NEXT:    movw %si, (%eax)
+; X86-SHLD-NEXT:    addl $72, %esp
+; X86-SHLD-NEXT:    popl %esi
+; X86-SHLD-NEXT:    retl
+;
+; X86-HAVE-BMI2-NO-SHLD-LABEL: load_2byte_chunk_of_32byte_alloca_with_zero_upper_half:
+; X86-HAVE-BMI2-NO-SHLD:       # %bb.0:
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    pushl %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    subl $72, %esp
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movups (%edx), %xmm0
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    xorps %xmm1, %xmm1
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shll $3, %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movaps %xmm1, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movaps %xmm1, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movaps %xmm1, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movaps %xmm0, (%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrb $5, %dl
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movzbl %dl, %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %ecx, (%esp,%edx,4), %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    notb %cl
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 4(%esp,%edx,4), %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    addl %edx, %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %ecx, %edx, %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl %esi, %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movw %cx, (%eax)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    addl $72, %esp
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    popl %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    retl
   %init = load <16 x i8>, ptr %src, align 1
   %intermediate.sroa.0.0.vec.expand = shufflevector <16 x i8> %init, <16 x i8> poison, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
   %intermediate.sroa.0.0.vecblend = shufflevector <32 x i8> %intermediate.sroa.0.0.vec.expand, <32 x i8> <i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0>, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63>
@@ -822,58 +1238,136 @@ define void @load_2byte_chunk_of_32byte_alloca_with_zero_upper_half(ptr %src, i6
 }
 
 define void @load_4byte_chunk_of_32byte_alloca_with_zero_upper_half(ptr %src, i64 %byteOff, ptr %dst) nounwind {
-; X64-LABEL: load_4byte_chunk_of_32byte_alloca_with_zero_upper_half:
-; X64:       # %bb.0:
-; X64-NEXT:    movdqu (%rdi), %xmm0
-; X64-NEXT:    shll $3, %esi
-; X64-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
-; X64-NEXT:    movq %xmm0, -{{[0-9]+}}(%rsp)
-; X64-NEXT:    movq %xmm1, -{{[0-9]+}}(%rsp)
-; X64-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
-; X64-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
-; X64-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
-; X64-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
-; X64-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
-; X64-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
-; X64-NEXT:    shrb $3, %sil
-; X64-NEXT:    movzbl %sil, %eax
-; X64-NEXT:    movl -64(%rsp,%rax), %eax
-; X64-NEXT:    movl %eax, (%rdx)
-; X64-NEXT:    retq
-;
-; X86-LABEL: load_4byte_chunk_of_32byte_alloca_with_zero_upper_half:
-; X86:       # %bb.0:
-; X86-NEXT:    subl $64, %esp
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-NEXT:    movdqu (%edx), %xmm0
-; X86-NEXT:    shll $3, %ecx
-; X86-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
-; X86-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[2,3,2,3]
-; X86-NEXT:    pshufd {{.*#+}} xmm3 = xmm0[3,3,3,3]
-; X86-NEXT:    movd %xmm0, (%esp)
-; X86-NEXT:    movd %xmm3, {{[0-9]+}}(%esp)
-; X86-NEXT:    movd %xmm2, {{[0-9]+}}(%esp)
-; X86-NEXT:    movd %xmm1, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT:    shrb $3, %cl
-; X86-NEXT:    movzbl %cl, %ecx
-; X86-NEXT:    movl (%esp,%ecx), %ecx
-; X86-NEXT:    movl %ecx, (%eax)
-; X86-NEXT:    addl $64, %esp
-; X86-NEXT:    retl
+; X64-NO-BMI2-LABEL: load_4byte_chunk_of_32byte_alloca_with_zero_upper_half:
+; X64-NO-BMI2:       # %bb.0:
+; X64-NO-BMI2-NEXT:    movups (%rdi), %xmm0
+; X64-NO-BMI2-NEXT:    xorps %xmm1, %xmm1
+; X64-NO-BMI2-NEXT:    leal (,%rsi,8), %ecx
+; X64-NO-BMI2-NEXT:    movaps %xmm1, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-NEXT:    movaps %xmm1, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-NEXT:    movaps %xmm1, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-NEXT:    movl %ecx, %eax
+; X64-NO-BMI2-NEXT:    shrb $6, %al
+; X64-NO-BMI2-NEXT:    movzbl %al, %eax
+; X64-NO-BMI2-NEXT:    movq -72(%rsp,%rax,8), %rsi
+; X64-NO-BMI2-NEXT:    shrq %cl, %rsi
+; X64-NO-BMI2-NEXT:    movl -64(%rsp,%rax,8), %eax
+; X64-NO-BMI2-NEXT:    addl %eax, %eax
+; X64-NO-BMI2-NEXT:    andb $56, %cl
+; X64-NO-BMI2-NEXT:    notb %cl
+; X64-NO-BMI2-NEXT:    # kill: def $cl killed $cl killed $ecx
+; X64-NO-BMI2-NEXT:    shlq %cl, %rax
+; X64-NO-BMI2-NEXT:    orl %esi, %eax
+; X64-NO-BMI2-NEXT:    movl %eax, (%rdx)
+; X64-NO-BMI2-NEXT:    retq
+;
+; X64-BMI2-LABEL: load_4byte_chunk_of_32byte_alloca_with_zero_upper_half:
+; X64-BMI2:       # %bb.0:
+; X64-BMI2-NEXT:    movups (%rdi), %xmm0
+; X64-BMI2-NEXT:    xorps %xmm1, %xmm1
+; X64-BMI2-NEXT:    shll $3, %esi
+; X64-BMI2-NEXT:    movaps %xmm1, -{{[0-9]+}}(%rsp)
+; X64-BMI2-NEXT:    movaps %xmm1, -{{[0-9]+}}(%rsp)
+; X64-BMI2-NEXT:    movaps %xmm1, -{{[0-9]+}}(%rsp)
+; X64-BMI2-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
+; X64-BMI2-NEXT:    movl %esi, %eax
+; X64-BMI2-NEXT:    shrb $6, %al
+; X64-BMI2-NEXT:    movzbl %al, %eax
+; X64-BMI2-NEXT:    shrxq %rsi, -72(%rsp,%rax,8), %rcx
+; X64-BMI2-NEXT:    # kill: def $sil killed $sil killed $rsi def $rsi
+; X64-BMI2-NEXT:    andb $56, %sil
+; X64-BMI2-NEXT:    notb %sil
+; X64-BMI2-NEXT:    movl -64(%rsp,%rax,8), %eax
+; X64-BMI2-NEXT:    addl %eax, %eax
+; X64-BMI2-NEXT:    shlxq %rsi, %rax, %rax
+; X64-BMI2-NEXT:    orl %eax, %ecx
+; X64-BMI2-NEXT:    movl %ecx, (%rdx)
+; X64-BMI2-NEXT:    retq
+;
+; X86-NO-BMI2-NO-SHLD-LABEL: load_4byte_chunk_of_32byte_alloca_with_zero_upper_half:
+; X86-NO-BMI2-NO-SHLD:       # %bb.0:
+; X86-NO-BMI2-NO-SHLD-NEXT:    pushl %esi
+; X86-NO-BMI2-NO-SHLD-NEXT:    subl $72, %esp
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-NO-BMI2-NO-SHLD-NEXT:    movups (%edx), %xmm0
+; X86-NO-BMI2-NO-SHLD-NEXT:    xorps %xmm1, %xmm1
+; X86-NO-BMI2-NO-SHLD-NEXT:    shll $3, %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT:    movaps %xmm1, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-NO-SHLD-NEXT:    movaps %xmm1, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-NO-SHLD-NEXT:    movaps %xmm1, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-NO-SHLD-NEXT:    movaps %xmm0, (%esp)
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ecx, %edx
+; X86-NO-BMI2-NO-SHLD-NEXT:    shrb $5, %dl
+; X86-NO-BMI2-NO-SHLD-NEXT:    movzbl %dl, %edx
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl (%esp,%edx,4), %esi
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl 4(%esp,%edx,4), %edx
+; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %esi
+; X86-NO-BMI2-NO-SHLD-NEXT:    notb %cl
+; X86-NO-BMI2-NO-SHLD-NEXT:    addl %edx, %edx
+; X86-NO-BMI2-NO-SHLD-NEXT:    # kill: def $cl killed $cl killed $ecx
+; X86-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %edx
+; X86-NO-BMI2-NO-SHLD-NEXT:    orl %esi, %edx
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %edx, (%eax)
+; X86-NO-BMI2-NO-SHLD-NEXT:    addl $72, %esp
+; X86-NO-BMI2-NO-SHLD-NEXT:    popl %esi
+; X86-NO-BMI2-NO-SHLD-NEXT:    retl
+;
+; X86-SHLD-LABEL: load_4byte_chunk_of_32byte_alloca_with_zero_upper_half:
+; X86-SHLD:       # %bb.0:
+; X86-SHLD-NEXT:    pushl %esi
+; X86-SHLD-NEXT:    subl $72, %esp
+; X86-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-SHLD-NEXT:    movups (%edx), %xmm0
+; X86-SHLD-NEXT:    xorps %xmm1, %xmm1
+; X86-SHLD-NEXT:    shll $3, %ecx
+; X86-SHLD-NEXT:    movaps %xmm1, {{[0-9]+}}(%esp)
+; X86-SHLD-NEXT:    movaps %xmm1, {{[0-9]+}}(%esp)
+; X86-SHLD-NEXT:    movaps %xmm1, {{[0-9]+}}(%esp)
+; X86-SHLD-NEXT:    movaps %xmm0, (%esp)
+; X86-SHLD-NEXT:    movl %ecx, %edx
+; X86-SHLD-NEXT:    shrb $5, %dl
+; X86-SHLD-NEXT:    movzbl %dl, %edx
+; X86-SHLD-NEXT:    movl (%esp,%edx,4), %esi
+; X86-SHLD-NEXT:    movl 4(%esp,%edx,4), %edx
+; X86-SHLD-NEXT:    # kill: def $cl killed $cl killed $ecx
+; X86-SHLD-NEXT:    shrdl %cl, %edx, %esi
+; X86-SHLD-NEXT:    movl %esi, (%eax)
+; X86-SHLD-NEXT:    addl $72, %esp
+; X86-SHLD-NEXT:    popl %esi
+; X86-SHLD-NEXT:    retl
+;
+; X86-HAVE-BMI2-NO-SHLD-LABEL: load_4byte_chunk_of_32byte_alloca_with_zero_upper_half:
+; X86-HAVE-BMI2-NO-SHLD:       # %bb.0:
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    pushl %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    subl $72, %esp
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movups (%edx), %xmm0
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    xorps %xmm1, %xmm1
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shll $3, %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movaps %xmm1, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movaps %xmm1, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movaps %xmm1, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movaps %xmm0, (%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrb $5, %dl
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movzbl %dl, %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %ecx, (%esp,%edx,4), %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    notb %cl
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 4(%esp,%edx,4), %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    addl %edx, %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %ecx, %edx, %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl %esi, %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, (%eax)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    addl $72, %esp
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    popl %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    retl
   %init = load <16 x i8>, ptr %src, align 1
   %intermediate.sroa.0.0.vec.expand = shufflevector <16 x i8> %init, <16 x i8> poison, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
   %intermediate.sroa.0.0.vecblend = shufflevector <32 x i8> %intermediate.sroa.0.0.vec.expand, <32 x i8> <i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0>, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63>
@@ -888,60 +1382,191 @@ define void @load_4byte_chunk_of_32byte_alloca_with_zero_upper_half(ptr %src, i6
 }
 
 define void @load_8byte_chunk_of_32byte_alloca_with_zero_upper_half(ptr %src, i64 %byteOff, ptr %dst) nounwind {
-; X64-LABEL: load_8byte_chunk_of_32byte_alloca_with_zero_upper_half:
-; X64:       # %bb.0:
-; X64-NEXT:    movdqu (%rdi), %xmm0
-; X64-NEXT:    shll $3, %esi
-; X64-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
-; X64-NEXT:    movq %xmm0, -{{[0-9]+}}(%rsp)
-; X64-NEXT:    movq %xmm1, -{{[0-9]+}}(%rsp)
-; X64-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
-; X64-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
-; X64-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
-; X64-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
-; X64-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
-; X64-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
-; X64-NEXT:    shrb $3, %sil
-; X64-NEXT:    movzbl %sil, %eax
-; X64-NEXT:    movq -64(%rsp,%rax), %rax
-; X64-NEXT:    movq %rax, (%rdx)
-; X64-NEXT:    retq
-;
-; X86-LABEL: load_8byte_chunk_of_32byte_alloca_with_zero_upper_half:
-; X86:       # %bb.0:
-; X86-NEXT:    subl $64, %esp
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-NEXT:    movdqu (%edx), %xmm0
-; X86-NEXT:    shll $3, %ecx
-; X86-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
-; X86-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[2,3,2,3]
-; X86-NEXT:    pshufd {{.*#+}} xmm3 = xmm0[3,3,3,3]
-; X86-NEXT:    movd %xmm0, (%esp)
-; X86-NEXT:    movd %xmm3, {{[0-9]+}}(%esp)
-; X86-NEXT:    movd %xmm2, {{[0-9]+}}(%esp)
-; X86-NEXT:    movd %xmm1, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT:    shrb $3, %cl
-; X86-NEXT:    movzbl %cl, %ecx
-; X86-NEXT:    movl (%esp,%ecx), %edx
-; X86-NEXT:    movl 4(%esp,%ecx), %ecx
-; X86-NEXT:    movl %ecx, 4(%eax)
-; X86-NEXT:    movl %edx, (%eax)
-; X86-NEXT:    addl $64, %esp
-; X86-NEXT:    retl
+; X64-NO-BMI2-NO-SHLD-LABEL: load_8byte_chunk_of_32byte_alloca_with_zero_upper_half:
+; X64-NO-BMI2-NO-SHLD:       # %bb.0:
+; X64-NO-BMI2-NO-SHLD-NEXT:    movups (%rdi), %xmm0
+; X64-NO-BMI2-NO-SHLD-NEXT:    xorps %xmm1, %xmm1
+; X64-NO-BMI2-NO-SHLD-NEXT:    leal (,%rsi,8), %ecx
+; X64-NO-BMI2-NO-SHLD-NEXT:    movaps %xmm1, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-NO-SHLD-NEXT:    movaps %xmm1, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-NO-SHLD-NEXT:    movaps %xmm1, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-NO-SHLD-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-NO-SHLD-NEXT:    movl %ecx, %eax
+; X64-NO-BMI2-NO-SHLD-NEXT:    shrb $6, %al
+; X64-NO-BMI2-NO-SHLD-NEXT:    movzbl %al, %eax
+; X64-NO-BMI2-NO-SHLD-NEXT:    movq -72(%rsp,%rax,8), %rsi
+; X64-NO-BMI2-NO-SHLD-NEXT:    movq -64(%rsp,%rax,8), %rax
+; X64-NO-BMI2-NO-SHLD-NEXT:    shrq %cl, %rsi
+; X64-NO-BMI2-NO-SHLD-NEXT:    notb %cl
+; X64-NO-BMI2-NO-SHLD-NEXT:    addq %rax, %rax
+; X64-NO-BMI2-NO-SHLD-NEXT:    # kill: def $cl killed $cl killed $ecx
+; X64-NO-BMI2-NO-SHLD-NEXT:    shlq %cl, %rax
+; X64-NO-BMI2-NO-SHLD-NEXT:    orq %rsi, %rax
+; X64-NO-BMI2-NO-SHLD-NEXT:    movq %rax, (%rdx)
+; X64-NO-BMI2-NO-SHLD-NEXT:    retq
+;
+; X64-SHLD-LABEL: load_8byte_chunk_of_32byte_alloca_with_zero_upper_half:
+; X64-SHLD:       # %bb.0:
+; X64-SHLD-NEXT:    movups (%rdi), %xmm0
+; X64-SHLD-NEXT:    xorps %xmm1, %xmm1
+; X64-SHLD-NEXT:    leal (,%rsi,8), %ecx
+; X64-SHLD-NEXT:    movaps %xmm1, -{{[0-9]+}}(%rsp)
+; X64-SHLD-NEXT:    movaps %xmm1, -{{[0-9]+}}(%rsp)
+; X64-SHLD-NEXT:    movaps %xmm1, -{{[0-9]+}}(%rsp)
+; X64-SHLD-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
+; X64-SHLD-NEXT:    movl %ecx, %eax
+; X64-SHLD-NEXT:    shrb $6, %al
+; X64-SHLD-NEXT:    movzbl %al, %eax
+; X64-SHLD-NEXT:    movq -72(%rsp,%rax,8), %rsi
+; X64-SHLD-NEXT:    movq -64(%rsp,%rax,8), %rax
+; X64-SHLD-NEXT:    # kill: def $cl killed $cl killed $ecx
+; X64-SHLD-NEXT:    shrdq %cl, %rax, %rsi
+; X64-SHLD-NEXT:    movq %rsi, (%rdx)
+; X64-SHLD-NEXT:    retq
+;
+; X64-HAVE-BMI2-NO-SHLD-LABEL: load_8byte_chunk_of_32byte_alloca_with_zero_upper_half:
+; X64-HAVE-BMI2-NO-SHLD:       # %bb.0:
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movups (%rdi), %xmm0
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    xorps %xmm1, %xmm1
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    shll $3, %esi
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movaps %xmm1, -{{[0-9]+}}(%rsp)
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movaps %xmm1, -{{[0-9]+}}(%rsp)
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movaps %xmm1, -{{[0-9]+}}(%rsp)
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movl %esi, %eax
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    shrb $6, %al
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movzbl %al, %eax
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    shrxq %rsi, -72(%rsp,%rax,8), %rcx
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    notb %sil
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq -64(%rsp,%rax,8), %rax
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    addq %rax, %rax
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    shlxq %rsi, %rax, %rax
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    orq %rcx, %rax
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq %rax, (%rdx)
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    retq
+;
+; X86-NO-BMI2-NO-SHLD-LABEL: load_8byte_chunk_of_32byte_alloca_with_zero_upper_half:
+; X86-NO-BMI2-NO-SHLD:       # %bb.0:
+; X86-NO-BMI2-NO-SHLD-NEXT:    pushl %ebp
+; X86-NO-BMI2-NO-SHLD-NEXT:    pushl %ebx
+; X86-NO-BMI2-NO-SHLD-NEXT:    pushl %edi
+; X86-NO-BMI2-NO-SHLD-NEXT:    pushl %esi
+; X86-NO-BMI2-NO-SHLD-NEXT:    subl $76, %esp
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT:    movups (%ecx), %xmm0
+; X86-NO-BMI2-NO-SHLD-NEXT:    xorps %xmm1, %xmm1
+; X86-NO-BMI2-NO-SHLD-NEXT:    shll $3, %eax
+; X86-NO-BMI2-NO-SHLD-NEXT:    movaps %xmm1, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-NO-SHLD-NEXT:    movaps %xmm1, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-NO-SHLD-NEXT:    movaps %xmm1, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-NO-SHLD-NEXT:    movaps %xmm0, (%esp)
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT:    shrb $5, %cl
+; X86-NO-BMI2-NO-SHLD-NEXT:    movzbl %cl, %ebx
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl (%esp,%ebx,4), %ebp
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl 4(%esp,%ebx,4), %esi
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %ebp
+; X86-NO-BMI2-NO-SHLD-NEXT:    notb %cl
+; X86-NO-BMI2-NO-SHLD-NEXT:    leal (%esi,%esi), %edi
+; X86-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %edi
+; X86-NO-BMI2-NO-SHLD-NEXT:    orl %ebp, %edi
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %esi
+; X86-NO-BMI2-NO-SHLD-NEXT:    andb $24, %al
+; X86-NO-BMI2-NO-SHLD-NEXT:    notb %al
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl 8(%esp,%ebx,4), %ebx
+; X86-NO-BMI2-NO-SHLD-NEXT:    addl %ebx, %ebx
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %ebx
+; X86-NO-BMI2-NO-SHLD-NEXT:    orl %esi, %ebx
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ebx, 4(%edx)
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %edi, (%edx)
+; X86-NO-BMI2-NO-SHLD-NEXT:    addl $76, %esp
+; X86-NO-BMI2-NO-SHLD-NEXT:    popl %esi
+; X86-NO-BMI2-NO-SHLD-NEXT:    popl %edi
+; X86-NO-BMI2-NO-SHLD-NEXT:    popl %ebx
+; X86-NO-BMI2-NO-SHLD-NEXT:    popl %ebp
+; X86-NO-BMI2-NO-SHLD-NEXT:    retl
+;
+; X86-SHLD-LABEL: load_8byte_chunk_of_32byte_alloca_with_zero_upper_half:
+; X86-SHLD:       # %bb.0:
+; X86-SHLD-NEXT:    pushl %ebx
+; X86-SHLD-NEXT:    pushl %edi
+; X86-SHLD-NEXT:    pushl %esi
+; X86-SHLD-NEXT:    subl $64, %esp
+; X86-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-SHLD-NEXT:    movups (%edx), %xmm0
+; X86-SHLD-NEXT:    xorps %xmm1, %xmm1
+; X86-SHLD-NEXT:    shll $3, %ecx
+; X86-SHLD-NEXT:    movaps %xmm1, {{[0-9]+}}(%esp)
+; X86-SHLD-NEXT:    movaps %xmm1, {{[0-9]+}}(%esp)
+; X86-SHLD-NEXT:    movaps %xmm1, {{[0-9]+}}(%esp)
+; X86-SHLD-NEXT:    movaps %xmm0, (%esp)
+; X86-SHLD-NEXT:    movl %ecx, %edx
+; X86-SHLD-NEXT:    shrb $5, %dl
+; X86-SHLD-NEXT:    movzbl %dl, %edx
+; X86-SHLD-NEXT:    movl 8(%esp,%edx,4), %esi
+; X86-SHLD-NEXT:    movl (%esp,%edx,4), %edi
+; X86-SHLD-NEXT:    movl 4(%esp,%edx,4), %edx
+; X86-SHLD-NEXT:    movl %edx, %ebx
+; X86-SHLD-NEXT:    shrdl %cl, %esi, %ebx
+; X86-SHLD-NEXT:    # kill: def $cl killed $cl killed $ecx
+; X86-SHLD-NEXT:    shrdl %cl, %edx, %edi
+; X86-SHLD-NEXT:    movl %ebx, 4(%eax)
+; X86-SHLD-NEXT:    movl %edi, (%eax)
+; X86-SHLD-NEXT:    addl $64, %esp
+; X86-SHLD-NEXT:    popl %esi
+; X86-SHLD-NEXT:    popl %edi
+; X86-SHLD-NEXT:    popl %ebx
+; X86-SHLD-NEXT:    retl
+;
+; X86-HAVE-BMI2-NO-SHLD-LABEL: load_8byte_chunk_of_32byte_alloca_with_zero_upper_half:
+; X86-HAVE-BMI2-NO-SHLD:       # %bb.0:
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    pushl %ebp
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    pushl %ebx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    pushl %edi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    pushl %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    subl $76, %esp
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movups (%edx), %xmm0
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    xorps %xmm1, %xmm1
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shll $3, %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movaps %xmm1, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movaps %xmm1, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movaps %xmm1, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movaps %xmm0, (%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrb $5, %dl
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movzbl %dl, %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %ecx, (%esp,%edx,4), %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, %ebx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    notb %bl
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 4(%esp,%edx,4), %edi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 8(%esp,%edx,4), %ebp
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    leal (%edi,%edi), %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %ebx, %edx, %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl %esi, %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %ecx, %edi, %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    andb $24, %cl
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    notb %cl
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    addl %ebp, %ebp
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %ecx, %ebp, %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl %esi, %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, 4(%eax)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %edx, (%eax)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    addl $76, %esp
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    popl %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    popl %edi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    popl %ebx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    popl %ebp
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    retl
   %init = load <16 x i8>, ptr %src, align 1
   %intermediate.sroa.0.0.vec.expand = shufflevector <16 x i8> %init, <16 x i8> poison, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
   %intermediate.sroa.0.0.vecblend = shufflevector <32 x i8> %intermediate.sroa.0.0.vec.expand, <32 x i8> <i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0>, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63>
@@ -956,70 +1581,288 @@ define void @load_8byte_chunk_of_32byte_alloca_with_zero_upper_half(ptr %src, i6
 }
 
 define void @load_16byte_chunk_of_32byte_alloca_with_zero_upper_half(ptr %src, i64 %byteOff, ptr %dst) nounwind {
-; X64-LABEL: load_16byte_chunk_of_32byte_alloca_with_zero_upper_half:
-; X64:       # %bb.0:
-; X64-NEXT:    movdqu (%rdi), %xmm0
-; X64-NEXT:    shll $3, %esi
-; X64-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
-; X64-NEXT:    movq %xmm0, -{{[0-9]+}}(%rsp)
-; X64-NEXT:    movq %xmm1, -{{[0-9]+}}(%rsp)
-; X64-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
-; X64-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
-; X64-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
-; X64-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
-; X64-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
-; X64-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
-; X64-NEXT:    shrb $3, %sil
-; X64-NEXT:    movzbl %sil, %eax
-; X64-NEXT:    movq -64(%rsp,%rax), %rcx
-; X64-NEXT:    movq -56(%rsp,%rax), %rax
-; X64-NEXT:    movq %rax, 8(%rdx)
-; X64-NEXT:    movq %rcx, (%rdx)
-; X64-NEXT:    retq
-;
-; X86-LABEL: load_16byte_chunk_of_32byte_alloca_with_zero_upper_half:
-; X86:       # %bb.0:
-; X86-NEXT:    pushl %edi
-; X86-NEXT:    pushl %esi
-; X86-NEXT:    subl $64, %esp
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-NEXT:    movdqu (%edx), %xmm0
-; X86-NEXT:    shll $3, %ecx
-; X86-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
-; X86-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[2,3,2,3]
-; X86-NEXT:    pshufd {{.*#+}} xmm3 = xmm0[3,3,3,3]
-; X86-NEXT:    movd %xmm0, (%esp)
-; X86-NEXT:    movd %xmm3, {{[0-9]+}}(%esp)
-; X86-NEXT:    movd %xmm2, {{[0-9]+}}(%esp)
-; X86-NEXT:    movd %xmm1, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT:    shrb $3, %cl
-; X86-NEXT:    movzbl %cl, %ecx
-; X86-NEXT:    movl (%esp,%ecx), %edx
-; X86-NEXT:    movl 4(%esp,%ecx), %esi
-; X86-NEXT:    movl 8(%esp,%ecx), %edi
-; X86-NEXT:    movl 12(%esp,%ecx), %ecx
-; X86-NEXT:    movl %ecx, 12(%eax)
-; X86-NEXT:    movl %edi, 8(%eax)
-; X86-NEXT:    movl %esi, 4(%eax)
-; X86-NEXT:    movl %edx, (%eax)
-; X86-NEXT:    addl $64, %esp
-; X86-NEXT:    popl %esi
-; X86-NEXT:    popl %edi
-; X86-NEXT:    retl
+; X64-NO-BMI2-NO-SHLD-LABEL: load_16byte_chunk_of_32byte_alloca_with_zero_upper_half:
+; X64-NO-BMI2-NO-SHLD:       # %bb.0:
+; X64-NO-BMI2-NO-SHLD-NEXT:    movups (%rdi), %xmm0
+; X64-NO-BMI2-NO-SHLD-NEXT:    xorps %xmm1, %xmm1
+; X64-NO-BMI2-NO-SHLD-NEXT:    leal (,%rsi,8), %eax
+; X64-NO-BMI2-NO-SHLD-NEXT:    movaps %xmm1, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-NO-SHLD-NEXT:    movaps %xmm1, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-NO-SHLD-NEXT:    movaps %xmm1, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-NO-SHLD-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %ecx
+; X64-NO-BMI2-NO-SHLD-NEXT:    shrb $6, %cl
+; X64-NO-BMI2-NO-SHLD-NEXT:    movzbl %cl, %edi
+; X64-NO-BMI2-NO-SHLD-NEXT:    movq -72(%rsp,%rdi,8), %r8
+; X64-NO-BMI2-NO-SHLD-NEXT:    movq -64(%rsp,%rdi,8), %r9
+; X64-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %ecx
+; X64-NO-BMI2-NO-SHLD-NEXT:    shrq %cl, %r8
+; X64-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %esi
+; X64-NO-BMI2-NO-SHLD-NEXT:    notb %sil
+; X64-NO-BMI2-NO-SHLD-NEXT:    leaq (%r9,%r9), %r10
+; X64-NO-BMI2-NO-SHLD-NEXT:    movl %esi, %ecx
+; X64-NO-BMI2-NO-SHLD-NEXT:    shlq %cl, %r10
+; X64-NO-BMI2-NO-SHLD-NEXT:    orq %r8, %r10
+; X64-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %ecx
+; X64-NO-BMI2-NO-SHLD-NEXT:    shrq %cl, %r9
+; X64-NO-BMI2-NO-SHLD-NEXT:    movq -56(%rsp,%rdi,8), %rax
+; X64-NO-BMI2-NO-SHLD-NEXT:    addq %rax, %rax
+; X64-NO-BMI2-NO-SHLD-NEXT:    movl %esi, %ecx
+; X64-NO-BMI2-NO-SHLD-NEXT:    shlq %cl, %rax
+; X64-NO-BMI2-NO-SHLD-NEXT:    orq %r9, %rax
+; X64-NO-BMI2-NO-SHLD-NEXT:    movq %rax, 8(%rdx)
+; X64-NO-BMI2-NO-SHLD-NEXT:    movq %r10, (%rdx)
+; X64-NO-BMI2-NO-SHLD-NEXT:    retq
+;
+; X64-NO-BMI2-HAVE-SHLD-LABEL: load_16byte_chunk_of_32byte_alloca_with_zero_upper_half:
+; X64-NO-BMI2-HAVE-SHLD:       # %bb.0:
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movups (%rdi), %xmm0
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    xorps %xmm1, %xmm1
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    leal (,%rsi,8), %eax
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movaps %xmm1, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movaps %xmm1, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movaps %xmm1, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, %ecx
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    shrb $6, %cl
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movzbl %cl, %esi
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq -72(%rsp,%rsi,8), %rdi
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq -64(%rsp,%rsi,8), %r8
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %r8, %r9
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, %ecx
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    shrq %cl, %r9
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    notb %cl
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq -56(%rsp,%rsi,8), %rsi
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    addq %rsi, %rsi
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    shlq %cl, %rsi
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    orq %r9, %rsi
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, %ecx
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    shrdq %cl, %r8, %rdi
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %rdi, (%rdx)
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %rsi, 8(%rdx)
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    retq
+;
+; X64-HAVE-BMI2-NO-SHLD-LABEL: load_16byte_chunk_of_32byte_alloca_with_zero_upper_half:
+; X64-HAVE-BMI2-NO-SHLD:       # %bb.0:
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movups (%rdi), %xmm0
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    xorps %xmm1, %xmm1
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    shll $3, %esi
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movaps %xmm1, -{{[0-9]+}}(%rsp)
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movaps %xmm1, -{{[0-9]+}}(%rsp)
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movaps %xmm1, -{{[0-9]+}}(%rsp)
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movl %esi, %eax
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    shrb $6, %al
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movzbl %al, %eax
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    shrxq %rsi, -72(%rsp,%rax,8), %rcx
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq -64(%rsp,%rax,8), %rdi
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    shrxq %rsi, %rdi, %r8
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    # kill: def $sil killed $sil killed $rsi def $rsi
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    notb %sil
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq -56(%rsp,%rax,8), %rax
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    addq %rdi, %rdi
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    shlxq %rsi, %rdi, %rdi
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    orq %rcx, %rdi
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    addq %rax, %rax
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    shlxq %rsi, %rax, %rax
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    orq %r8, %rax
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq %rax, 8(%rdx)
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq %rdi, (%rdx)
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    retq
+;
+; X64-HAVE-BMI2-HAVE-SHLD-LABEL: load_16byte_chunk_of_32byte_alloca_with_zero_upper_half:
+; X64-HAVE-BMI2-HAVE-SHLD:       # %bb.0:
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movups (%rdi), %xmm0
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    xorps %xmm1, %xmm1
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    leal (,%rsi,8), %ecx
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movaps %xmm1, -{{[0-9]+}}(%rsp)
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movaps %xmm1, -{{[0-9]+}}(%rsp)
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movaps %xmm1, -{{[0-9]+}}(%rsp)
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ecx, %eax
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    shrb $6, %al
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movzbl %al, %eax
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq -72(%rsp,%rax,8), %rsi
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq -64(%rsp,%rax,8), %rdi
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    shrxq %rcx, %rdi, %r8
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ecx, %r9d
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    notb %r9b
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq -56(%rsp,%rax,8), %rax
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    addq %rax, %rax
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    shlxq %r9, %rax, %rax
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    orq %r8, %rax
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    # kill: def $cl killed $cl killed $rcx
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    shrdq %cl, %rdi, %rsi
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq %rsi, (%rdx)
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq %rax, 8(%rdx)
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    retq
+;
+; X86-NO-BMI2-NO-SHLD-LABEL: load_16byte_chunk_of_32byte_alloca_with_zero_upper_half:
+; X86-NO-BMI2-NO-SHLD:       # %bb.0:
+; X86-NO-BMI2-NO-SHLD-NEXT:    pushl %ebp
+; X86-NO-BMI2-NO-SHLD-NEXT:    pushl %ebx
+; X86-NO-BMI2-NO-SHLD-NEXT:    pushl %edi
+; X86-NO-BMI2-NO-SHLD-NEXT:    pushl %esi
+; X86-NO-BMI2-NO-SHLD-NEXT:    subl $92, %esp
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT:    movups (%ecx), %xmm0
+; X86-NO-BMI2-NO-SHLD-NEXT:    xorps %xmm1, %xmm1
+; X86-NO-BMI2-NO-SHLD-NEXT:    shll $3, %eax
+; X86-NO-BMI2-NO-SHLD-NEXT:    movaps %xmm1, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-NO-SHLD-NEXT:    movaps %xmm1, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-NO-SHLD-NEXT:    movaps %xmm1, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-NO-SHLD-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT:    shrb $5, %cl
+; X86-NO-BMI2-NO-SHLD-NEXT:    movzbl %cl, %edi
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl 16(%esp,%edi,4), %edx
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl 20(%esp,%edi,4), %ebp
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %edx
+; X86-NO-BMI2-NO-SHLD-NEXT:    notb %cl
+; X86-NO-BMI2-NO-SHLD-NEXT:    movb %cl, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill
+; X86-NO-BMI2-NO-SHLD-NEXT:    leal (%ebp,%ebp), %esi
+; X86-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %esi
+; X86-NO-BMI2-NO-SHLD-NEXT:    orl %edx, %esi
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %ebp
+; X86-NO-BMI2-NO-SHLD-NEXT:    movb %al, %ch
+; X86-NO-BMI2-NO-SHLD-NEXT:    andb $24, %ch
+; X86-NO-BMI2-NO-SHLD-NEXT:    xorb $31, %ch
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl 24(%esp,%edi,4), %edx
+; X86-NO-BMI2-NO-SHLD-NEXT:    leal (%edx,%edx), %ebx
+; X86-NO-BMI2-NO-SHLD-NEXT:    movb %ch, %cl
+; X86-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %ebx
+; X86-NO-BMI2-NO-SHLD-NEXT:    orl %ebp, %ebx
+; X86-NO-BMI2-NO-SHLD-NEXT:    movb %al, %cl
+; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %edx
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl 28(%esp,%edi,4), %esi
+; X86-NO-BMI2-NO-SHLD-NEXT:    leal (%esi,%esi), %ebp
+; X86-NO-BMI2-NO-SHLD-NEXT:    movb {{[-0-9]+}}(%e{{[sb]}}p), %cl # 1-byte Reload
+; X86-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %ebp
+; X86-NO-BMI2-NO-SHLD-NEXT:    orl %edx, %ebp
+; X86-NO-BMI2-NO-SHLD-NEXT:    movb %al, %cl
+; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %esi
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl 32(%esp,%edi,4), %eax
+; X86-NO-BMI2-NO-SHLD-NEXT:    addl %eax, %eax
+; X86-NO-BMI2-NO-SHLD-NEXT:    movb %ch, %cl
+; X86-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %eax
+; X86-NO-BMI2-NO-SHLD-NEXT:    orl %esi, %eax
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, 12(%ecx)
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ebp, 8(%ecx)
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ebx, 4(%ecx)
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, (%ecx)
+; X86-NO-BMI2-NO-SHLD-NEXT:    addl $92, %esp
+; X86-NO-BMI2-NO-SHLD-NEXT:    popl %esi
+; X86-NO-BMI2-NO-SHLD-NEXT:    popl %edi
+; X86-NO-BMI2-NO-SHLD-NEXT:    popl %ebx
+; X86-NO-BMI2-NO-SHLD-NEXT:    popl %ebp
+; X86-NO-BMI2-NO-SHLD-NEXT:    retl
+;
+; X86-SHLD-LABEL: load_16byte_chunk_of_32byte_alloca_with_zero_upper_half:
+; X86-SHLD:       # %bb.0:
+; X86-SHLD-NEXT:    pushl %ebp
+; X86-SHLD-NEXT:    pushl %ebx
+; X86-SHLD-NEXT:    pushl %edi
+; X86-SHLD-NEXT:    pushl %esi
+; X86-SHLD-NEXT:    subl $92, %esp
+; X86-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-SHLD-NEXT:    movups (%eax), %xmm0
+; X86-SHLD-NEXT:    xorps %xmm1, %xmm1
+; X86-SHLD-NEXT:    shll $3, %ecx
+; X86-SHLD-NEXT:    movaps %xmm1, {{[0-9]+}}(%esp)
+; X86-SHLD-NEXT:    movaps %xmm1, {{[0-9]+}}(%esp)
+; X86-SHLD-NEXT:    movaps %xmm1, {{[0-9]+}}(%esp)
+; X86-SHLD-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
+; X86-SHLD-NEXT:    movl %ecx, %eax
+; X86-SHLD-NEXT:    shrb $5, %al
+; X86-SHLD-NEXT:    movzbl %al, %ebx
+; X86-SHLD-NEXT:    movl 24(%esp,%ebx,4), %esi
+; X86-SHLD-NEXT:    movl 16(%esp,%ebx,4), %eax
+; X86-SHLD-NEXT:    movl 20(%esp,%ebx,4), %edi
+; X86-SHLD-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-SHLD-NEXT:    shrdl %cl, %esi, %edi
+; X86-SHLD-NEXT:    movl 28(%esp,%ebx,4), %ebp
+; X86-SHLD-NEXT:    shrdl %cl, %ebp, %esi
+; X86-SHLD-NEXT:    movl 32(%esp,%ebx,4), %ebx
+; X86-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-SHLD-NEXT:    shrdl %cl, %ebx, %ebp
+; X86-SHLD-NEXT:    movl %ebp, 12(%edx)
+; X86-SHLD-NEXT:    movl %esi, 8(%edx)
+; X86-SHLD-NEXT:    movl %edi, 4(%edx)
+; X86-SHLD-NEXT:    # kill: def $cl killed $cl killed $ecx
+; X86-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X86-SHLD-NEXT:    shrdl %cl, %esi, %eax
+; X86-SHLD-NEXT:    movl %eax, (%edx)
+; X86-SHLD-NEXT:    addl $92, %esp
+; X86-SHLD-NEXT:    popl %esi
+; X86-SHLD-NEXT:    popl %edi
+; X86-SHLD-NEXT:    popl %ebx
+; X86-SHLD-NEXT:    popl %ebp
+; X86-SHLD-NEXT:    retl
+;
+; X86-HAVE-BMI2-NO-SHLD-LABEL: load_16byte_chunk_of_32byte_alloca_with_zero_upper_half:
+; X86-HAVE-BMI2-NO-SHLD:       # %bb.0:
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    pushl %ebp
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    pushl %ebx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    pushl %edi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    pushl %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    subl $92, %esp
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movups (%ecx), %xmm0
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    xorps %xmm1, %xmm1
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shll $3, %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movaps %xmm1, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movaps %xmm1, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movaps %xmm1, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrb $5, %cl
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movzbl %cl, %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %eax, 16(%esp,%ecx,4), %edi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 20(%esp,%ecx,4), %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %eax, %esi, %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, %ebx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    notb %dl
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    addl %esi, %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %edx, %esi, %ebp
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl %edi, %ebp
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 24(%esp,%ecx,4), %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    andb $24, %bl
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    xorb $31, %bl
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    leal (%esi,%esi), %edi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %ebx, %edi, %edi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %eax, %esi, %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 28(%esp,%ecx,4), %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %eax, %esi, %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    addl %esi, %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %edx, %esi, %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 32(%esp,%ecx,4), %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    addl %ecx, %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %ebx, %ecx, %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl %eax, %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, 12(%eax)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %edx, 8(%eax)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %edi, 4(%eax)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ebp, (%eax)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    addl $92, %esp
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    popl %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    popl %edi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    popl %ebx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    popl %ebp
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    retl
   %init = load <16 x i8>, ptr %src, align 1
   %intermediate.sroa.0.0.vec.expand = shufflevector <16 x i8> %init, <16 x i8> poison, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
   %intermediate.sroa.0.0.vecblend = shufflevector <32 x i8> %intermediate.sroa.0.0.vec.expand, <32 x i8> <i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0>, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63>
@@ -1034,84 +1877,155 @@ define void @load_16byte_chunk_of_32byte_alloca_with_zero_upper_half(ptr %src, i
 }
 
 define void @load_1byte_chunk_of_64byte_alloca_with_zero_upper_half(ptr %src, i64 %byteOff, ptr %dst) nounwind {
-; X64-LABEL: load_1byte_chunk_of_64byte_alloca_with_zero_upper_half:
-; X64:       # %bb.0:
-; X64-NEXT:    movdqu (%rdi), %xmm0
-; X64-NEXT:    movdqu 16(%rdi), %xmm1
-; X64-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[2,3,2,3]
-; X64-NEXT:    pshufd {{.*#+}} xmm3 = xmm1[2,3,2,3]
-; X64-NEXT:    movq %xmm1, -{{[0-9]+}}(%rsp)
-; X64-NEXT:    movq %xmm0, -{{[0-9]+}}(%rsp)
-; X64-NEXT:    movq %xmm3, -{{[0-9]+}}(%rsp)
-; X64-NEXT:    movq %xmm2, -{{[0-9]+}}(%rsp)
-; X64-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
-; X64-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
-; X64-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
-; X64-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
-; X64-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
-; X64-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
-; X64-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
-; X64-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
-; X64-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
-; X64-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
-; X64-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
-; X64-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
-; X64-NEXT:    andl $63, %esi
-; X64-NEXT:    movzbl -128(%rsp,%rsi), %eax
-; X64-NEXT:    movb %al, (%rdx)
-; X64-NEXT:    retq
-;
-; X86-LABEL: load_1byte_chunk_of_64byte_alloca_with_zero_upper_half:
-; X86:       # %bb.0:
-; X86-NEXT:    subl $128, %esp
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-NEXT:    movdqu (%edx), %xmm0
-; X86-NEXT:    movdqu 16(%edx), %xmm1
-; X86-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[1,1,1,1]
-; X86-NEXT:    pshufd {{.*#+}} xmm3 = xmm0[2,3,2,3]
-; X86-NEXT:    pshufd {{.*#+}} xmm4 = xmm0[3,3,3,3]
-; X86-NEXT:    pshufd {{.*#+}} xmm5 = xmm1[1,1,1,1]
-; X86-NEXT:    pshufd {{.*#+}} xmm6 = xmm1[2,3,2,3]
-; X86-NEXT:    pshufd {{.*#+}} xmm7 = xmm1[3,3,3,3]
-; X86-NEXT:    movd %xmm1, {{[0-9]+}}(%esp)
-; X86-NEXT:    movd %xmm0, (%esp)
-; X86-NEXT:    movd %xmm7, {{[0-9]+}}(%esp)
-; X86-NEXT:    movd %xmm6, {{[0-9]+}}(%esp)
-; X86-NEXT:    movd %xmm5, {{[0-9]+}}(%esp)
-; X86-NEXT:    movd %xmm4, {{[0-9]+}}(%esp)
-; X86-NEXT:    movd %xmm3, {{[0-9]+}}(%esp)
-; X86-NEXT:    movd %xmm2, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT:    andl $63, %ecx
-; X86-NEXT:    movzbl (%esp,%ecx), %ecx
-; X86-NEXT:    movb %cl, (%eax)
-; X86-NEXT:    addl $128, %esp
-; X86-NEXT:    retl
+; X64-NO-BMI2-LABEL: load_1byte_chunk_of_64byte_alloca_with_zero_upper_half:
+; X64-NO-BMI2:       # %bb.0:
+; X64-NO-BMI2-NEXT:    pushq %rax
+; X64-NO-BMI2-NEXT:    movups (%rdi), %xmm0
+; X64-NO-BMI2-NEXT:    movups 16(%rdi), %xmm1
+; X64-NO-BMI2-NEXT:    xorps %xmm2, %xmm2
+; X64-NO-BMI2-NEXT:    movaps %xmm2, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-NEXT:    movaps %xmm2, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-NEXT:    movaps %xmm2, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-NEXT:    movaps %xmm2, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-NEXT:    movaps %xmm2, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-NEXT:    movaps %xmm2, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-NEXT:    movaps %xmm1, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-NEXT:    leal (,%rsi,8), %ecx
+; X64-NO-BMI2-NEXT:    andl $56, %ecx
+; X64-NO-BMI2-NEXT:    andl $56, %esi
+; X64-NO-BMI2-NEXT:    movq -128(%rsp,%rsi), %rax
+; X64-NO-BMI2-NEXT:    shrq %cl, %rax
+; X64-NO-BMI2-NEXT:    movl -120(%rsp,%rsi), %esi
+; X64-NO-BMI2-NEXT:    addl %esi, %esi
+; X64-NO-BMI2-NEXT:    notl %ecx
+; X64-NO-BMI2-NEXT:    # kill: def $cl killed $cl killed $ecx
+; X64-NO-BMI2-NEXT:    shlq %cl, %rsi
+; X64-NO-BMI2-NEXT:    orl %eax, %esi
+; X64-NO-BMI2-NEXT:    movb %sil, (%rdx)
+; X64-NO-BMI2-NEXT:    popq %rax
+; X64-NO-BMI2-NEXT:    retq
+;
+; X64-BMI2-LABEL: load_1byte_chunk_of_64byte_alloca_with_zero_upper_half:
+; X64-BMI2:       # %bb.0:
+; X64-BMI2-NEXT:    pushq %rax
+; X64-BMI2-NEXT:    movups (%rdi), %xmm0
+; X64-BMI2-NEXT:    movups 16(%rdi), %xmm1
+; X64-BMI2-NEXT:    xorps %xmm2, %xmm2
+; X64-BMI2-NEXT:    movaps %xmm2, -{{[0-9]+}}(%rsp)
+; X64-BMI2-NEXT:    movaps %xmm2, -{{[0-9]+}}(%rsp)
+; X64-BMI2-NEXT:    movaps %xmm2, -{{[0-9]+}}(%rsp)
+; X64-BMI2-NEXT:    movaps %xmm2, -{{[0-9]+}}(%rsp)
+; X64-BMI2-NEXT:    movaps %xmm2, -{{[0-9]+}}(%rsp)
+; X64-BMI2-NEXT:    movaps %xmm2, -{{[0-9]+}}(%rsp)
+; X64-BMI2-NEXT:    movaps %xmm1, -{{[0-9]+}}(%rsp)
+; X64-BMI2-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
+; X64-BMI2-NEXT:    leal (,%rsi,8), %eax
+; X64-BMI2-NEXT:    andl $56, %eax
+; X64-BMI2-NEXT:    andl $56, %esi
+; X64-BMI2-NEXT:    shrxq %rax, -128(%rsp,%rsi), %rcx
+; X64-BMI2-NEXT:    # kill: def $eax killed $eax killed $rax def $rax
+; X64-BMI2-NEXT:    notl %eax
+; X64-BMI2-NEXT:    movl -120(%rsp,%rsi), %esi
+; X64-BMI2-NEXT:    addl %esi, %esi
+; X64-BMI2-NEXT:    shlxq %rax, %rsi, %rax
+; X64-BMI2-NEXT:    orl %eax, %ecx
+; X64-BMI2-NEXT:    movb %cl, (%rdx)
+; X64-BMI2-NEXT:    popq %rax
+; X64-BMI2-NEXT:    retq
+;
+; X86-NO-BMI2-NO-SHLD-LABEL: load_1byte_chunk_of_64byte_alloca_with_zero_upper_half:
+; X86-NO-BMI2-NO-SHLD:       # %bb.0:
+; X86-NO-BMI2-NO-SHLD-NEXT:    pushl %esi
+; X86-NO-BMI2-NO-SHLD-NEXT:    subl $136, %esp
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT:    movups (%ecx), %xmm0
+; X86-NO-BMI2-NO-SHLD-NEXT:    movups 16(%ecx), %xmm1
+; X86-NO-BMI2-NO-SHLD-NEXT:    xorps %xmm2, %xmm2
+; X86-NO-BMI2-NO-SHLD-NEXT:    movaps %xmm2, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-NO-SHLD-NEXT:    movaps %xmm2, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-NO-SHLD-NEXT:    movaps %xmm2, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-NO-SHLD-NEXT:    movaps %xmm2, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-NO-SHLD-NEXT:    movaps %xmm2, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-NO-SHLD-NEXT:    movaps %xmm2, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-NO-SHLD-NEXT:    movaps %xmm1, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-NO-SHLD-NEXT:    movaps %xmm0, (%esp)
+; X86-NO-BMI2-NO-SHLD-NEXT:    leal (,%edx,8), %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT:    andl $60, %edx
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl (%esp,%edx), %esi
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl 4(%esp,%edx), %edx
+; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %esi
+; X86-NO-BMI2-NO-SHLD-NEXT:    notb %cl
+; X86-NO-BMI2-NO-SHLD-NEXT:    addl %edx, %edx
+; X86-NO-BMI2-NO-SHLD-NEXT:    # kill: def $cl killed $cl killed $ecx
+; X86-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %edx
+; X86-NO-BMI2-NO-SHLD-NEXT:    orl %esi, %edx
+; X86-NO-BMI2-NO-SHLD-NEXT:    movb %dl, (%eax)
+; X86-NO-BMI2-NO-SHLD-NEXT:    addl $136, %esp
+; X86-NO-BMI2-NO-SHLD-NEXT:    popl %esi
+; X86-NO-BMI2-NO-SHLD-NEXT:    retl
+;
+; X86-SHLD-LABEL: load_1byte_chunk_of_64byte_alloca_with_zero_upper_half:
+; X86-SHLD:       # %bb.0:
+; X86-SHLD-NEXT:    pushl %ebx
+; X86-SHLD-NEXT:    subl $136, %esp
+; X86-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-SHLD-NEXT:    movups (%ecx), %xmm0
+; X86-SHLD-NEXT:    movups 16(%ecx), %xmm1
+; X86-SHLD-NEXT:    xorps %xmm2, %xmm2
+; X86-SHLD-NEXT:    movaps %xmm2, {{[0-9]+}}(%esp)
+; X86-SHLD-NEXT:    movaps %xmm2, {{[0-9]+}}(%esp)
+; X86-SHLD-NEXT:    movaps %xmm2, {{[0-9]+}}(%esp)
+; X86-SHLD-NEXT:    movaps %xmm2, {{[0-9]+}}(%esp)
+; X86-SHLD-NEXT:    movaps %xmm2, {{[0-9]+}}(%esp)
+; X86-SHLD-NEXT:    movaps %xmm2, {{[0-9]+}}(%esp)
+; X86-SHLD-NEXT:    movaps %xmm1, {{[0-9]+}}(%esp)
+; X86-SHLD-NEXT:    movaps %xmm0, (%esp)
+; X86-SHLD-NEXT:    leal (,%edx,8), %ecx
+; X86-SHLD-NEXT:    andl $60, %edx
+; X86-SHLD-NEXT:    movl (%esp,%edx), %ebx
+; X86-SHLD-NEXT:    movl 4(%esp,%edx), %edx
+; X86-SHLD-NEXT:    # kill: def $cl killed $cl killed $ecx
+; X86-SHLD-NEXT:    shrdl %cl, %edx, %ebx
+; X86-SHLD-NEXT:    movb %bl, (%eax)
+; X86-SHLD-NEXT:    addl $136, %esp
+; X86-SHLD-NEXT:    popl %ebx
+; X86-SHLD-NEXT:    retl
+;
+; X86-HAVE-BMI2-NO-SHLD-LABEL: load_1byte_chunk_of_64byte_alloca_with_zero_upper_half:
+; X86-HAVE-BMI2-NO-SHLD:       # %bb.0:
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    pushl %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    subl $136, %esp
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movups (%edx), %xmm0
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movups 16(%edx), %xmm1
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    xorps %xmm2, %xmm2
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movaps %xmm2, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movaps %xmm2, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movaps %xmm2, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movaps %xmm2, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movaps %xmm2, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movaps %xmm2, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movaps %xmm1, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movaps %xmm0, (%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    leal (,%ecx,8), %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    andl $60, %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %edx, (%esp,%ecx), %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    notb %dl
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 4(%esp,%ecx), %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    addl %ecx, %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %edx, %ecx, %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl %esi, %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movb %cl, (%eax)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    addl $136, %esp
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    popl %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    retl
   %init = load <32 x i8>, ptr %src, align 1
   %intermediate.sroa.0.0.vec.expand = shufflevector <32 x i8> %init, <32 x i8> poison, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
   %intermediate.sroa.0.0.vecblend = shufflevector <64 x i8> %intermediate.sroa.0.0.vec.expand, <64 x i8> <i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0>, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 96, i32 97, i32 98, i32 99, i32 100, i32 101, i32 102, i32 103, i32 104, i32 105, i32 106, i32 107, i32 108, i32 109, i32 110, i32 111, i32 112, i32 113, i32 114, i32 115, i32 116, i32 117, i32 118, i32 119, i32 120, i32 121, i32 122, i32 123, i32 124, i32 125, i32 126, i32 127>
@@ -1127,84 +2041,155 @@ define void @load_1byte_chunk_of_64byte_alloca_with_zero_upper_half(ptr %src, i6
 }
 
 define void @load_2byte_chunk_of_64byte_alloca_with_zero_upper_half(ptr %src, i64 %byteOff, ptr %dst) nounwind {
-; X64-LABEL: load_2byte_chunk_of_64byte_alloca_with_zero_upper_half:
-; X64:       # %bb.0:
-; X64-NEXT:    movdqu (%rdi), %xmm0
-; X64-NEXT:    movdqu 16(%rdi), %xmm1
-; X64-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[2,3,2,3]
-; X64-NEXT:    pshufd {{.*#+}} xmm3 = xmm1[2,3,2,3]
-; X64-NEXT:    movq %xmm1, -{{[0-9]+}}(%rsp)
-; X64-NEXT:    movq %xmm0, -{{[0-9]+}}(%rsp)
-; X64-NEXT:    movq %xmm3, -{{[0-9]+}}(%rsp)
-; X64-NEXT:    movq %xmm2, -{{[0-9]+}}(%rsp)
-; X64-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
-; X64-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
-; X64-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
-; X64-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
-; X64-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
-; X64-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
-; X64-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
-; X64-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
-; X64-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
-; X64-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
-; X64-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
-; X64-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
-; X64-NEXT:    andl $63, %esi
-; X64-NEXT:    movq -128(%rsp,%rsi), %rax
-; X64-NEXT:    movw %ax, (%rdx)
-; X64-NEXT:    retq
-;
-; X86-LABEL: load_2byte_chunk_of_64byte_alloca_with_zero_upper_half:
-; X86:       # %bb.0:
-; X86-NEXT:    subl $128, %esp
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-NEXT:    movdqu (%edx), %xmm0
-; X86-NEXT:    movdqu 16(%edx), %xmm1
-; X86-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[1,1,1,1]
-; X86-NEXT:    pshufd {{.*#+}} xmm3 = xmm0[2,3,2,3]
-; X86-NEXT:    pshufd {{.*#+}} xmm4 = xmm0[3,3,3,3]
-; X86-NEXT:    pshufd {{.*#+}} xmm5 = xmm1[1,1,1,1]
-; X86-NEXT:    pshufd {{.*#+}} xmm6 = xmm1[2,3,2,3]
-; X86-NEXT:    pshufd {{.*#+}} xmm7 = xmm1[3,3,3,3]
-; X86-NEXT:    movd %xmm1, {{[0-9]+}}(%esp)
-; X86-NEXT:    movd %xmm0, (%esp)
-; X86-NEXT:    movd %xmm7, {{[0-9]+}}(%esp)
-; X86-NEXT:    movd %xmm6, {{[0-9]+}}(%esp)
-; X86-NEXT:    movd %xmm5, {{[0-9]+}}(%esp)
-; X86-NEXT:    movd %xmm4, {{[0-9]+}}(%esp)
-; X86-NEXT:    movd %xmm3, {{[0-9]+}}(%esp)
-; X86-NEXT:    movd %xmm2, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT:    andl $63, %ecx
-; X86-NEXT:    movl (%esp,%ecx), %ecx
-; X86-NEXT:    movw %cx, (%eax)
-; X86-NEXT:    addl $128, %esp
-; X86-NEXT:    retl
+; X64-NO-BMI2-LABEL: load_2byte_chunk_of_64byte_alloca_with_zero_upper_half:
+; X64-NO-BMI2:       # %bb.0:
+; X64-NO-BMI2-NEXT:    pushq %rax
+; X64-NO-BMI2-NEXT:    movups (%rdi), %xmm0
+; X64-NO-BMI2-NEXT:    movups 16(%rdi), %xmm1
+; X64-NO-BMI2-NEXT:    xorps %xmm2, %xmm2
+; X64-NO-BMI2-NEXT:    movaps %xmm2, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-NEXT:    movaps %xmm2, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-NEXT:    movaps %xmm2, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-NEXT:    movaps %xmm2, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-NEXT:    movaps %xmm2, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-NEXT:    movaps %xmm2, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-NEXT:    movaps %xmm1, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-NEXT:    leal (,%rsi,8), %ecx
+; X64-NO-BMI2-NEXT:    andl $56, %ecx
+; X64-NO-BMI2-NEXT:    andl $56, %esi
+; X64-NO-BMI2-NEXT:    movq -128(%rsp,%rsi), %rax
+; X64-NO-BMI2-NEXT:    shrq %cl, %rax
+; X64-NO-BMI2-NEXT:    movl -120(%rsp,%rsi), %esi
+; X64-NO-BMI2-NEXT:    addl %esi, %esi
+; X64-NO-BMI2-NEXT:    notl %ecx
+; X64-NO-BMI2-NEXT:    # kill: def $cl killed $cl killed $ecx
+; X64-NO-BMI2-NEXT:    shlq %cl, %rsi
+; X64-NO-BMI2-NEXT:    orl %eax, %esi
+; X64-NO-BMI2-NEXT:    movw %si, (%rdx)
+; X64-NO-BMI2-NEXT:    popq %rax
+; X64-NO-BMI2-NEXT:    retq
+;
+; X64-BMI2-LABEL: load_2byte_chunk_of_64byte_alloca_with_zero_upper_half:
+; X64-BMI2:       # %bb.0:
+; X64-BMI2-NEXT:    pushq %rax
+; X64-BMI2-NEXT:    movups (%rdi), %xmm0
+; X64-BMI2-NEXT:    movups 16(%rdi), %xmm1
+; X64-BMI2-NEXT:    xorps %xmm2, %xmm2
+; X64-BMI2-NEXT:    movaps %xmm2, -{{[0-9]+}}(%rsp)
+; X64-BMI2-NEXT:    movaps %xmm2, -{{[0-9]+}}(%rsp)
+; X64-BMI2-NEXT:    movaps %xmm2, -{{[0-9]+}}(%rsp)
+; X64-BMI2-NEXT:    movaps %xmm2, -{{[0-9]+}}(%rsp)
+; X64-BMI2-NEXT:    movaps %xmm2, -{{[0-9]+}}(%rsp)
+; X64-BMI2-NEXT:    movaps %xmm2, -{{[0-9]+}}(%rsp)
+; X64-BMI2-NEXT:    movaps %xmm1, -{{[0-9]+}}(%rsp)
+; X64-BMI2-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
+; X64-BMI2-NEXT:    leal (,%rsi,8), %eax
+; X64-BMI2-NEXT:    andl $56, %eax
+; X64-BMI2-NEXT:    andl $56, %esi
+; X64-BMI2-NEXT:    shrxq %rax, -128(%rsp,%rsi), %rcx
+; X64-BMI2-NEXT:    # kill: def $eax killed $eax killed $rax def $rax
+; X64-BMI2-NEXT:    notl %eax
+; X64-BMI2-NEXT:    movl -120(%rsp,%rsi), %esi
+; X64-BMI2-NEXT:    addl %esi, %esi
+; X64-BMI2-NEXT:    shlxq %rax, %rsi, %rax
+; X64-BMI2-NEXT:    orl %eax, %ecx
+; X64-BMI2-NEXT:    movw %cx, (%rdx)
+; X64-BMI2-NEXT:    popq %rax
+; X64-BMI2-NEXT:    retq
+;
+; X86-NO-BMI2-NO-SHLD-LABEL: load_2byte_chunk_of_64byte_alloca_with_zero_upper_half:
+; X86-NO-BMI2-NO-SHLD:       # %bb.0:
+; X86-NO-BMI2-NO-SHLD-NEXT:    pushl %esi
+; X86-NO-BMI2-NO-SHLD-NEXT:    subl $136, %esp
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT:    movups (%ecx), %xmm0
+; X86-NO-BMI2-NO-SHLD-NEXT:    movups 16(%ecx), %xmm1
+; X86-NO-BMI2-NO-SHLD-NEXT:    xorps %xmm2, %xmm2
+; X86-NO-BMI2-NO-SHLD-NEXT:    movaps %xmm2, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-NO-SHLD-NEXT:    movaps %xmm2, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-NO-SHLD-NEXT:    movaps %xmm2, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-NO-SHLD-NEXT:    movaps %xmm2, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-NO-SHLD-NEXT:    movaps %xmm2, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-NO-SHLD-NEXT:    movaps %xmm2, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-NO-SHLD-NEXT:    movaps %xmm1, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-NO-SHLD-NEXT:    movaps %xmm0, (%esp)
+; X86-NO-BMI2-NO-SHLD-NEXT:    leal (,%edx,8), %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT:    andl $60, %edx
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl (%esp,%edx), %esi
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl 4(%esp,%edx), %edx
+; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %esi
+; X86-NO-BMI2-NO-SHLD-NEXT:    notb %cl
+; X86-NO-BMI2-NO-SHLD-NEXT:    addl %edx, %edx
+; X86-NO-BMI2-NO-SHLD-NEXT:    # kill: def $cl killed $cl killed $ecx
+; X86-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %edx
+; X86-NO-BMI2-NO-SHLD-NEXT:    orl %esi, %edx
+; X86-NO-BMI2-NO-SHLD-NEXT:    movw %dx, (%eax)
+; X86-NO-BMI2-NO-SHLD-NEXT:    addl $136, %esp
+; X86-NO-BMI2-NO-SHLD-NEXT:    popl %esi
+; X86-NO-BMI2-NO-SHLD-NEXT:    retl
+;
+; X86-SHLD-LABEL: load_2byte_chunk_of_64byte_alloca_with_zero_upper_half:
+; X86-SHLD:       # %bb.0:
+; X86-SHLD-NEXT:    pushl %esi
+; X86-SHLD-NEXT:    subl $136, %esp
+; X86-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-SHLD-NEXT:    movups (%ecx), %xmm0
+; X86-SHLD-NEXT:    movups 16(%ecx), %xmm1
+; X86-SHLD-NEXT:    xorps %xmm2, %xmm2
+; X86-SHLD-NEXT:    movaps %xmm2, {{[0-9]+}}(%esp)
+; X86-SHLD-NEXT:    movaps %xmm2, {{[0-9]+}}(%esp)
+; X86-SHLD-NEXT:    movaps %xmm2, {{[0-9]+}}(%esp)
+; X86-SHLD-NEXT:    movaps %xmm2, {{[0-9]+}}(%esp)
+; X86-SHLD-NEXT:    movaps %xmm2, {{[0-9]+}}(%esp)
+; X86-SHLD-NEXT:    movaps %xmm2, {{[0-9]+}}(%esp)
+; X86-SHLD-NEXT:    movaps %xmm1, {{[0-9]+}}(%esp)
+; X86-SHLD-NEXT:    movaps %xmm0, (%esp)
+; X86-SHLD-NEXT:    leal (,%edx,8), %ecx
+; X86-SHLD-NEXT:    andl $60, %edx
+; X86-SHLD-NEXT:    movl (%esp,%edx), %esi
+; X86-SHLD-NEXT:    movl 4(%esp,%edx), %edx
+; X86-SHLD-NEXT:    # kill: def $cl killed $cl killed $ecx
+; X86-SHLD-NEXT:    shrdl %cl, %edx, %esi
+; X86-SHLD-NEXT:    movw %si, (%eax)
+; X86-SHLD-NEXT:    addl $136, %esp
+; X86-SHLD-NEXT:    popl %esi
+; X86-SHLD-NEXT:    retl
+;
+; X86-HAVE-BMI2-NO-SHLD-LABEL: load_2byte_chunk_of_64byte_alloca_with_zero_upper_half:
+; X86-HAVE-BMI2-NO-SHLD:       # %bb.0:
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    pushl %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    subl $136, %esp
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movups (%edx), %xmm0
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movups 16(%edx), %xmm1
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    xorps %xmm2, %xmm2
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movaps %xmm2, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movaps %xmm2, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movaps %xmm2, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movaps %xmm2, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movaps %xmm2, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movaps %xmm2, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movaps %xmm1, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movaps %xmm0, (%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    leal (,%ecx,8), %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    andl $60, %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %edx, (%esp,%ecx), %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    notb %dl
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 4(%esp,%ecx), %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    addl %ecx, %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %edx, %ecx, %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl %esi, %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movw %cx, (%eax)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    addl $136, %esp
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    popl %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    retl
   %init = load <32 x i8>, ptr %src, align 1
   %intermediate.sroa.0.0.vec.expand = shufflevector <32 x i8> %init, <32 x i8> poison, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
   %intermediate.sroa.0.0.vecblend = shufflevector <64 x i8> %intermediate.sroa.0.0.vec.expand, <64 x i8> <i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0>, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 96, i32 97, i32 98, i32 99, i32 100, i32 101, i32 102, i32 103, i32 104, i32 105, i32 106, i32 107, i32 108, i32 109, i32 110, i32 111, i32 112, i32 113, i32 114, i32 115, i32 116, i32 117, i32 118, i32 119, i32 120, i32 121, i32 122, i32 123, i32 124, i32 125, i32 126, i32 127>
@@ -1219,84 +2204,155 @@ define void @load_2byte_chunk_of_64byte_alloca_with_zero_upper_half(ptr %src, i6
 }
 
 define void @load_4byte_chunk_of_64byte_alloca_with_zero_upper_half(ptr %src, i64 %byteOff, ptr %dst) nounwind {
-; X64-LABEL: load_4byte_chunk_of_64byte_alloca_with_zero_upper_half:
-; X64:       # %bb.0:
-; X64-NEXT:    movdqu (%rdi), %xmm0
-; X64-NEXT:    movdqu 16(%rdi), %xmm1
-; X64-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[2,3,2,3]
-; X64-NEXT:    pshufd {{.*#+}} xmm3 = xmm1[2,3,2,3]
-; X64-NEXT:    movq %xmm1, -{{[0-9]+}}(%rsp)
-; X64-NEXT:    movq %xmm0, -{{[0-9]+}}(%rsp)
-; X64-NEXT:    movq %xmm3, -{{[0-9]+}}(%rsp)
-; X64-NEXT:    movq %xmm2, -{{[0-9]+}}(%rsp)
-; X64-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
-; X64-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
-; X64-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
-; X64-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
-; X64-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
-; X64-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
-; X64-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
-; X64-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
-; X64-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
-; X64-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
-; X64-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
-; X64-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
-; X64-NEXT:    andl $63, %esi
-; X64-NEXT:    movl -128(%rsp,%rsi), %eax
-; X64-NEXT:    movl %eax, (%rdx)
-; X64-NEXT:    retq
-;
-; X86-LABEL: load_4byte_chunk_of_64byte_alloca_with_zero_upper_half:
-; X86:       # %bb.0:
-; X86-NEXT:    subl $128, %esp
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-NEXT:    movdqu (%edx), %xmm0
-; X86-NEXT:    movdqu 16(%edx), %xmm1
-; X86-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[1,1,1,1]
-; X86-NEXT:    pshufd {{.*#+}} xmm3 = xmm0[2,3,2,3]
-; X86-NEXT:    pshufd {{.*#+}} xmm4 = xmm0[3,3,3,3]
-; X86-NEXT:    pshufd {{.*#+}} xmm5 = xmm1[1,1,1,1]
-; X86-NEXT:    pshufd {{.*#+}} xmm6 = xmm1[2,3,2,3]
-; X86-NEXT:    pshufd {{.*#+}} xmm7 = xmm1[3,3,3,3]
-; X86-NEXT:    movd %xmm1, {{[0-9]+}}(%esp)
-; X86-NEXT:    movd %xmm0, (%esp)
-; X86-NEXT:    movd %xmm7, {{[0-9]+}}(%esp)
-; X86-NEXT:    movd %xmm6, {{[0-9]+}}(%esp)
-; X86-NEXT:    movd %xmm5, {{[0-9]+}}(%esp)
-; X86-NEXT:    movd %xmm4, {{[0-9]+}}(%esp)
-; X86-NEXT:    movd %xmm3, {{[0-9]+}}(%esp)
-; X86-NEXT:    movd %xmm2, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT:    andl $63, %ecx
-; X86-NEXT:    movl (%esp,%ecx), %ecx
-; X86-NEXT:    movl %ecx, (%eax)
-; X86-NEXT:    addl $128, %esp
-; X86-NEXT:    retl
+; X64-NO-BMI2-LABEL: load_4byte_chunk_of_64byte_alloca_with_zero_upper_half:
+; X64-NO-BMI2:       # %bb.0:
+; X64-NO-BMI2-NEXT:    pushq %rax
+; X64-NO-BMI2-NEXT:    movups (%rdi), %xmm0
+; X64-NO-BMI2-NEXT:    movups 16(%rdi), %xmm1
+; X64-NO-BMI2-NEXT:    xorps %xmm2, %xmm2
+; X64-NO-BMI2-NEXT:    movaps %xmm2, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-NEXT:    movaps %xmm2, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-NEXT:    movaps %xmm2, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-NEXT:    movaps %xmm2, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-NEXT:    movaps %xmm2, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-NEXT:    movaps %xmm2, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-NEXT:    movaps %xmm1, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-NEXT:    leal (,%rsi,8), %ecx
+; X64-NO-BMI2-NEXT:    andl $56, %ecx
+; X64-NO-BMI2-NEXT:    andl $56, %esi
+; X64-NO-BMI2-NEXT:    movq -128(%rsp,%rsi), %rax
+; X64-NO-BMI2-NEXT:    shrq %cl, %rax
+; X64-NO-BMI2-NEXT:    movl -120(%rsp,%rsi), %esi
+; X64-NO-BMI2-NEXT:    addl %esi, %esi
+; X64-NO-BMI2-NEXT:    notl %ecx
+; X64-NO-BMI2-NEXT:    # kill: def $cl killed $cl killed $ecx
+; X64-NO-BMI2-NEXT:    shlq %cl, %rsi
+; X64-NO-BMI2-NEXT:    orl %eax, %esi
+; X64-NO-BMI2-NEXT:    movl %esi, (%rdx)
+; X64-NO-BMI2-NEXT:    popq %rax
+; X64-NO-BMI2-NEXT:    retq
+;
+; X64-BMI2-LABEL: load_4byte_chunk_of_64byte_alloca_with_zero_upper_half:
+; X64-BMI2:       # %bb.0:
+; X64-BMI2-NEXT:    pushq %rax
+; X64-BMI2-NEXT:    movups (%rdi), %xmm0
+; X64-BMI2-NEXT:    movups 16(%rdi), %xmm1
+; X64-BMI2-NEXT:    xorps %xmm2, %xmm2
+; X64-BMI2-NEXT:    movaps %xmm2, -{{[0-9]+}}(%rsp)
+; X64-BMI2-NEXT:    movaps %xmm2, -{{[0-9]+}}(%rsp)
+; X64-BMI2-NEXT:    movaps %xmm2, -{{[0-9]+}}(%rsp)
+; X64-BMI2-NEXT:    movaps %xmm2, -{{[0-9]+}}(%rsp)
+; X64-BMI2-NEXT:    movaps %xmm2, -{{[0-9]+}}(%rsp)
+; X64-BMI2-NEXT:    movaps %xmm2, -{{[0-9]+}}(%rsp)
+; X64-BMI2-NEXT:    movaps %xmm1, -{{[0-9]+}}(%rsp)
+; X64-BMI2-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
+; X64-BMI2-NEXT:    leal (,%rsi,8), %eax
+; X64-BMI2-NEXT:    andl $56, %eax
+; X64-BMI2-NEXT:    andl $56, %esi
+; X64-BMI2-NEXT:    shrxq %rax, -128(%rsp,%rsi), %rcx
+; X64-BMI2-NEXT:    # kill: def $eax killed $eax killed $rax def $rax
+; X64-BMI2-NEXT:    notl %eax
+; X64-BMI2-NEXT:    movl -120(%rsp,%rsi), %esi
+; X64-BMI2-NEXT:    addl %esi, %esi
+; X64-BMI2-NEXT:    shlxq %rax, %rsi, %rax
+; X64-BMI2-NEXT:    orl %eax, %ecx
+; X64-BMI2-NEXT:    movl %ecx, (%rdx)
+; X64-BMI2-NEXT:    popq %rax
+; X64-BMI2-NEXT:    retq
+;
+; X86-NO-BMI2-NO-SHLD-LABEL: load_4byte_chunk_of_64byte_alloca_with_zero_upper_half:
+; X86-NO-BMI2-NO-SHLD:       # %bb.0:
+; X86-NO-BMI2-NO-SHLD-NEXT:    pushl %esi
+; X86-NO-BMI2-NO-SHLD-NEXT:    subl $136, %esp
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT:    movups (%ecx), %xmm0
+; X86-NO-BMI2-NO-SHLD-NEXT:    movups 16(%ecx), %xmm1
+; X86-NO-BMI2-NO-SHLD-NEXT:    xorps %xmm2, %xmm2
+; X86-NO-BMI2-NO-SHLD-NEXT:    movaps %xmm2, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-NO-SHLD-NEXT:    movaps %xmm2, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-NO-SHLD-NEXT:    movaps %xmm2, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-NO-SHLD-NEXT:    movaps %xmm2, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-NO-SHLD-NEXT:    movaps %xmm2, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-NO-SHLD-NEXT:    movaps %xmm2, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-NO-SHLD-NEXT:    movaps %xmm1, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-NO-SHLD-NEXT:    movaps %xmm0, (%esp)
+; X86-NO-BMI2-NO-SHLD-NEXT:    leal (,%edx,8), %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT:    andl $60, %edx
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl (%esp,%edx), %esi
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl 4(%esp,%edx), %edx
+; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %esi
+; X86-NO-BMI2-NO-SHLD-NEXT:    notb %cl
+; X86-NO-BMI2-NO-SHLD-NEXT:    addl %edx, %edx
+; X86-NO-BMI2-NO-SHLD-NEXT:    # kill: def $cl killed $cl killed $ecx
+; X86-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %edx
+; X86-NO-BMI2-NO-SHLD-NEXT:    orl %esi, %edx
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %edx, (%eax)
+; X86-NO-BMI2-NO-SHLD-NEXT:    addl $136, %esp
+; X86-NO-BMI2-NO-SHLD-NEXT:    popl %esi
+; X86-NO-BMI2-NO-SHLD-NEXT:    retl
+;
+; X86-SHLD-LABEL: load_4byte_chunk_of_64byte_alloca_with_zero_upper_half:
+; X86-SHLD:       # %bb.0:
+; X86-SHLD-NEXT:    pushl %esi
+; X86-SHLD-NEXT:    subl $136, %esp
+; X86-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-SHLD-NEXT:    movups (%ecx), %xmm0
+; X86-SHLD-NEXT:    movups 16(%ecx), %xmm1
+; X86-SHLD-NEXT:    xorps %xmm2, %xmm2
+; X86-SHLD-NEXT:    movaps %xmm2, {{[0-9]+}}(%esp)
+; X86-SHLD-NEXT:    movaps %xmm2, {{[0-9]+}}(%esp)
+; X86-SHLD-NEXT:    movaps %xmm2, {{[0-9]+}}(%esp)
+; X86-SHLD-NEXT:    movaps %xmm2, {{[0-9]+}}(%esp)
+; X86-SHLD-NEXT:    movaps %xmm2, {{[0-9]+}}(%esp)
+; X86-SHLD-NEXT:    movaps %xmm2, {{[0-9]+}}(%esp)
+; X86-SHLD-NEXT:    movaps %xmm1, {{[0-9]+}}(%esp)
+; X86-SHLD-NEXT:    movaps %xmm0, (%esp)
+; X86-SHLD-NEXT:    leal (,%edx,8), %ecx
+; X86-SHLD-NEXT:    andl $60, %edx
+; X86-SHLD-NEXT:    movl (%esp,%edx), %esi
+; X86-SHLD-NEXT:    movl 4(%esp,%edx), %edx
+; X86-SHLD-NEXT:    # kill: def $cl killed $cl killed $ecx
+; X86-SHLD-NEXT:    shrdl %cl, %edx, %esi
+; X86-SHLD-NEXT:    movl %esi, (%eax)
+; X86-SHLD-NEXT:    addl $136, %esp
+; X86-SHLD-NEXT:    popl %esi
+; X86-SHLD-NEXT:    retl
+;
+; X86-HAVE-BMI2-NO-SHLD-LABEL: load_4byte_chunk_of_64byte_alloca_with_zero_upper_half:
+; X86-HAVE-BMI2-NO-SHLD:       # %bb.0:
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    pushl %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    subl $136, %esp
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movups (%edx), %xmm0
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movups 16(%edx), %xmm1
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    xorps %xmm2, %xmm2
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movaps %xmm2, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movaps %xmm2, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movaps %xmm2, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movaps %xmm2, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movaps %xmm2, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movaps %xmm2, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movaps %xmm1, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movaps %xmm0, (%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    leal (,%ecx,8), %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    andl $60, %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %edx, (%esp,%ecx), %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    notb %dl
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 4(%esp,%ecx), %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    addl %ecx, %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %edx, %ecx, %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl %esi, %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, (%eax)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    addl $136, %esp
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    popl %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    retl
   %init = load <32 x i8>, ptr %src, align 1
   %intermediate.sroa.0.0.vec.expand = shufflevector <32 x i8> %init, <32 x i8> poison, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
   %intermediate.sroa.0.0.vecblend = shufflevector <64 x i8> %intermediate.sroa.0.0.vec.expand, <64 x i8> <i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0>, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 96, i32 97, i32 98, i32 99, i32 100, i32 101, i32 102, i32 103, i32 104, i32 105, i32 106, i32 107, i32 108, i32 109, i32 110, i32 111, i32 112, i32 113, i32 114, i32 115, i32 116, i32 117, i32 118, i32 119, i32 120, i32 121, i32 122, i32 123, i32 124, i32 125, i32 126, i32 127>
@@ -1311,86 +2367,216 @@ define void @load_4byte_chunk_of_64byte_alloca_with_zero_upper_half(ptr %src, i6
 }
 
 define void @load_8byte_chunk_of_64byte_alloca_with_zero_upper_half(ptr %src, i64 %byteOff, ptr %dst) nounwind {
-; X64-LABEL: load_8byte_chunk_of_64byte_alloca_with_zero_upper_half:
-; X64:       # %bb.0:
-; X64-NEXT:    movdqu (%rdi), %xmm0
-; X64-NEXT:    movdqu 16(%rdi), %xmm1
-; X64-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[2,3,2,3]
-; X64-NEXT:    pshufd {{.*#+}} xmm3 = xmm1[2,3,2,3]
-; X64-NEXT:    movq %xmm1, -{{[0-9]+}}(%rsp)
-; X64-NEXT:    movq %xmm0, -{{[0-9]+}}(%rsp)
-; X64-NEXT:    movq %xmm3, -{{[0-9]+}}(%rsp)
-; X64-NEXT:    movq %xmm2, -{{[0-9]+}}(%rsp)
-; X64-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
-; X64-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
-; X64-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
-; X64-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
-; X64-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
-; X64-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
-; X64-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
-; X64-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
-; X64-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
-; X64-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
-; X64-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
-; X64-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
-; X64-NEXT:    andl $63, %esi
-; X64-NEXT:    movq -128(%rsp,%rsi), %rax
-; X64-NEXT:    movq %rax, (%rdx)
-; X64-NEXT:    retq
-;
-; X86-LABEL: load_8byte_chunk_of_64byte_alloca_with_zero_upper_half:
-; X86:       # %bb.0:
-; X86-NEXT:    subl $128, %esp
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-NEXT:    movdqu (%edx), %xmm0
-; X86-NEXT:    movdqu 16(%edx), %xmm1
-; X86-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[1,1,1,1]
-; X86-NEXT:    pshufd {{.*#+}} xmm3 = xmm0[2,3,2,3]
-; X86-NEXT:    pshufd {{.*#+}} xmm4 = xmm0[3,3,3,3]
-; X86-NEXT:    pshufd {{.*#+}} xmm5 = xmm1[1,1,1,1]
-; X86-NEXT:    pshufd {{.*#+}} xmm6 = xmm1[2,3,2,3]
-; X86-NEXT:    pshufd {{.*#+}} xmm7 = xmm1[3,3,3,3]
-; X86-NEXT:    movd %xmm1, {{[0-9]+}}(%esp)
-; X86-NEXT:    movd %xmm0, (%esp)
-; X86-NEXT:    movd %xmm7, {{[0-9]+}}(%esp)
-; X86-NEXT:    movd %xmm6, {{[0-9]+}}(%esp)
-; X86-NEXT:    movd %xmm5, {{[0-9]+}}(%esp)
-; X86-NEXT:    movd %xmm4, {{[0-9]+}}(%esp)
-; X86-NEXT:    movd %xmm3, {{[0-9]+}}(%esp)
-; X86-NEXT:    movd %xmm2, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT:    andl $63, %ecx
-; X86-NEXT:    movl (%esp,%ecx), %edx
-; X86-NEXT:    movl 4(%esp,%ecx), %ecx
-; X86-NEXT:    movl %ecx, 4(%eax)
-; X86-NEXT:    movl %edx, (%eax)
-; X86-NEXT:    addl $128, %esp
-; X86-NEXT:    retl
+; X64-NO-BMI2-NO-SHLD-LABEL: load_8byte_chunk_of_64byte_alloca_with_zero_upper_half:
+; X64-NO-BMI2-NO-SHLD:       # %bb.0:
+; X64-NO-BMI2-NO-SHLD-NEXT:    pushq %rax
+; X64-NO-BMI2-NO-SHLD-NEXT:    movups (%rdi), %xmm0
+; X64-NO-BMI2-NO-SHLD-NEXT:    movups 16(%rdi), %xmm1
+; X64-NO-BMI2-NO-SHLD-NEXT:    xorps %xmm2, %xmm2
+; X64-NO-BMI2-NO-SHLD-NEXT:    movaps %xmm2, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-NO-SHLD-NEXT:    movaps %xmm2, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-NO-SHLD-NEXT:    movaps %xmm2, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-NO-SHLD-NEXT:    movaps %xmm2, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-NO-SHLD-NEXT:    movaps %xmm2, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-NO-SHLD-NEXT:    movaps %xmm2, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-NO-SHLD-NEXT:    movaps %xmm1, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-NO-SHLD-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-NO-SHLD-NEXT:    leal (,%rsi,8), %ecx
+; X64-NO-BMI2-NO-SHLD-NEXT:    andl $56, %esi
+; X64-NO-BMI2-NO-SHLD-NEXT:    movq -128(%rsp,%rsi), %rax
+; X64-NO-BMI2-NO-SHLD-NEXT:    movq -120(%rsp,%rsi), %rsi
+; X64-NO-BMI2-NO-SHLD-NEXT:    shrq %cl, %rax
+; X64-NO-BMI2-NO-SHLD-NEXT:    notb %cl
+; X64-NO-BMI2-NO-SHLD-NEXT:    addq %rsi, %rsi
+; X64-NO-BMI2-NO-SHLD-NEXT:    # kill: def $cl killed $cl killed $ecx
+; X64-NO-BMI2-NO-SHLD-NEXT:    shlq %cl, %rsi
+; X64-NO-BMI2-NO-SHLD-NEXT:    orq %rax, %rsi
+; X64-NO-BMI2-NO-SHLD-NEXT:    movq %rsi, (%rdx)
+; X64-NO-BMI2-NO-SHLD-NEXT:    popq %rax
+; X64-NO-BMI2-NO-SHLD-NEXT:    retq
+;
+; X64-SHLD-LABEL: load_8byte_chunk_of_64byte_alloca_with_zero_upper_half:
+; X64-SHLD:       # %bb.0:
+; X64-SHLD-NEXT:    pushq %rax
+; X64-SHLD-NEXT:    movups (%rdi), %xmm0
+; X64-SHLD-NEXT:    movups 16(%rdi), %xmm1
+; X64-SHLD-NEXT:    xorps %xmm2, %xmm2
+; X64-SHLD-NEXT:    movaps %xmm2, -{{[0-9]+}}(%rsp)
+; X64-SHLD-NEXT:    movaps %xmm2, -{{[0-9]+}}(%rsp)
+; X64-SHLD-NEXT:    movaps %xmm2, -{{[0-9]+}}(%rsp)
+; X64-SHLD-NEXT:    movaps %xmm2, -{{[0-9]+}}(%rsp)
+; X64-SHLD-NEXT:    movaps %xmm2, -{{[0-9]+}}(%rsp)
+; X64-SHLD-NEXT:    movaps %xmm2, -{{[0-9]+}}(%rsp)
+; X64-SHLD-NEXT:    movaps %xmm1, -{{[0-9]+}}(%rsp)
+; X64-SHLD-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
+; X64-SHLD-NEXT:    leal (,%rsi,8), %ecx
+; X64-SHLD-NEXT:    andl $56, %esi
+; X64-SHLD-NEXT:    movq -128(%rsp,%rsi), %rax
+; X64-SHLD-NEXT:    movq -120(%rsp,%rsi), %rsi
+; X64-SHLD-NEXT:    # kill: def $cl killed $cl killed $ecx
+; X64-SHLD-NEXT:    shrdq %cl, %rsi, %rax
+; X64-SHLD-NEXT:    movq %rax, (%rdx)
+; X64-SHLD-NEXT:    popq %rax
+; X64-SHLD-NEXT:    retq
+;
+; X64-HAVE-BMI2-NO-SHLD-LABEL: load_8byte_chunk_of_64byte_alloca_with_zero_upper_half:
+; X64-HAVE-BMI2-NO-SHLD:       # %bb.0:
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    pushq %rax
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movups (%rdi), %xmm0
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movups 16(%rdi), %xmm1
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    xorps %xmm2, %xmm2
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movaps %xmm2, -{{[0-9]+}}(%rsp)
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movaps %xmm2, -{{[0-9]+}}(%rsp)
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movaps %xmm2, -{{[0-9]+}}(%rsp)
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movaps %xmm2, -{{[0-9]+}}(%rsp)
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movaps %xmm2, -{{[0-9]+}}(%rsp)
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movaps %xmm2, -{{[0-9]+}}(%rsp)
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movaps %xmm1, -{{[0-9]+}}(%rsp)
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    leal (,%rsi,8), %eax
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    andl $56, %esi
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    shrxq %rax, -128(%rsp,%rsi), %rcx
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    notb %al
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq -120(%rsp,%rsi), %rsi
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    addq %rsi, %rsi
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    shlxq %rax, %rsi, %rax
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    orq %rcx, %rax
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq %rax, (%rdx)
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    popq %rax
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    retq
+;
+; X86-NO-BMI2-NO-SHLD-LABEL: load_8byte_chunk_of_64byte_alloca_with_zero_upper_half:
+; X86-NO-BMI2-NO-SHLD:       # %bb.0:
+; X86-NO-BMI2-NO-SHLD-NEXT:    pushl %ebp
+; X86-NO-BMI2-NO-SHLD-NEXT:    pushl %ebx
+; X86-NO-BMI2-NO-SHLD-NEXT:    pushl %edi
+; X86-NO-BMI2-NO-SHLD-NEXT:    pushl %esi
+; X86-NO-BMI2-NO-SHLD-NEXT:    subl $140, %esp
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT:    movups (%ecx), %xmm0
+; X86-NO-BMI2-NO-SHLD-NEXT:    movups 16(%ecx), %xmm1
+; X86-NO-BMI2-NO-SHLD-NEXT:    xorps %xmm2, %xmm2
+; X86-NO-BMI2-NO-SHLD-NEXT:    movaps %xmm2, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-NO-SHLD-NEXT:    movaps %xmm2, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-NO-SHLD-NEXT:    movaps %xmm2, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-NO-SHLD-NEXT:    movaps %xmm2, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-NO-SHLD-NEXT:    movaps %xmm2, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-NO-SHLD-NEXT:    movaps %xmm2, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-NO-SHLD-NEXT:    movaps %xmm1, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-NO-SHLD-NEXT:    movaps %xmm0, (%esp)
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %ebx
+; X86-NO-BMI2-NO-SHLD-NEXT:    andl $60, %ebx
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl (%esp,%ebx), %esi
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl 4(%esp,%ebx), %edi
+; X86-NO-BMI2-NO-SHLD-NEXT:    shll $3, %eax
+; X86-NO-BMI2-NO-SHLD-NEXT:    andl $24, %eax
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %esi
+; X86-NO-BMI2-NO-SHLD-NEXT:    leal (%edi,%edi), %ebp
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %edx
+; X86-NO-BMI2-NO-SHLD-NEXT:    notb %dl
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %edx, %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %ebp
+; X86-NO-BMI2-NO-SHLD-NEXT:    orl %esi, %ebp
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %edi
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl 8(%esp,%ebx), %eax
+; X86-NO-BMI2-NO-SHLD-NEXT:    addl %eax, %eax
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %edx, %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %eax
+; X86-NO-BMI2-NO-SHLD-NEXT:    orl %edi, %eax
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, 4(%ecx)
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ebp, (%ecx)
+; X86-NO-BMI2-NO-SHLD-NEXT:    addl $140, %esp
+; X86-NO-BMI2-NO-SHLD-NEXT:    popl %esi
+; X86-NO-BMI2-NO-SHLD-NEXT:    popl %edi
+; X86-NO-BMI2-NO-SHLD-NEXT:    popl %ebx
+; X86-NO-BMI2-NO-SHLD-NEXT:    popl %ebp
+; X86-NO-BMI2-NO-SHLD-NEXT:    retl
+;
+; X86-SHLD-LABEL: load_8byte_chunk_of_64byte_alloca_with_zero_upper_half:
+; X86-SHLD:       # %bb.0:
+; X86-SHLD-NEXT:    pushl %ebx
+; X86-SHLD-NEXT:    pushl %edi
+; X86-SHLD-NEXT:    pushl %esi
+; X86-SHLD-NEXT:    subl $128, %esp
+; X86-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-SHLD-NEXT:    movups (%edx), %xmm0
+; X86-SHLD-NEXT:    movups 16(%edx), %xmm1
+; X86-SHLD-NEXT:    xorps %xmm2, %xmm2
+; X86-SHLD-NEXT:    movaps %xmm2, {{[0-9]+}}(%esp)
+; X86-SHLD-NEXT:    movaps %xmm2, {{[0-9]+}}(%esp)
+; X86-SHLD-NEXT:    movaps %xmm2, {{[0-9]+}}(%esp)
+; X86-SHLD-NEXT:    movaps %xmm2, {{[0-9]+}}(%esp)
+; X86-SHLD-NEXT:    movaps %xmm2, {{[0-9]+}}(%esp)
+; X86-SHLD-NEXT:    movaps %xmm2, {{[0-9]+}}(%esp)
+; X86-SHLD-NEXT:    movaps %xmm1, {{[0-9]+}}(%esp)
+; X86-SHLD-NEXT:    movaps %xmm0, (%esp)
+; X86-SHLD-NEXT:    movl %ecx, %esi
+; X86-SHLD-NEXT:    andl $60, %esi
+; X86-SHLD-NEXT:    movl 8(%esp,%esi), %edi
+; X86-SHLD-NEXT:    movl (%esp,%esi), %edx
+; X86-SHLD-NEXT:    movl 4(%esp,%esi), %esi
+; X86-SHLD-NEXT:    shll $3, %ecx
+; X86-SHLD-NEXT:    andl $24, %ecx
+; X86-SHLD-NEXT:    movl %esi, %ebx
+; X86-SHLD-NEXT:    shrdl %cl, %edi, %ebx
+; X86-SHLD-NEXT:    # kill: def $cl killed $cl killed $ecx
+; X86-SHLD-NEXT:    shrdl %cl, %esi, %edx
+; X86-SHLD-NEXT:    movl %ebx, 4(%eax)
+; X86-SHLD-NEXT:    movl %edx, (%eax)
+; X86-SHLD-NEXT:    addl $128, %esp
+; X86-SHLD-NEXT:    popl %esi
+; X86-SHLD-NEXT:    popl %edi
+; X86-SHLD-NEXT:    popl %ebx
+; X86-SHLD-NEXT:    retl
+;
+; X86-HAVE-BMI2-NO-SHLD-LABEL: load_8byte_chunk_of_64byte_alloca_with_zero_upper_half:
+; X86-HAVE-BMI2-NO-SHLD:       # %bb.0:
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    pushl %ebx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    pushl %edi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    pushl %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    subl $128, %esp
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movups (%edx), %xmm0
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movups 16(%edx), %xmm1
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    xorps %xmm2, %xmm2
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movaps %xmm2, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movaps %xmm2, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movaps %xmm2, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movaps %xmm2, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movaps %xmm2, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movaps %xmm2, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movaps %xmm1, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movaps %xmm0, (%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    leal (,%ecx,8), %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    andl $24, %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    andl $60, %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %edx, (%esp,%ecx), %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 4(%esp,%ecx), %edi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %edx, %edi, %ebx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    # kill: def $dl killed $dl killed $edx def $edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    notb %dl
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 8(%esp,%ecx), %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    addl %edi, %edi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %edx, %edi, %edi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl %esi, %edi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    addl %ecx, %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %edx, %ecx, %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl %ebx, %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, 4(%eax)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %edi, (%eax)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    addl $128, %esp
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    popl %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    popl %edi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    popl %ebx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    retl
   %init = load <32 x i8>, ptr %src, align 1
   %intermediate.sroa.0.0.vec.expand = shufflevector <32 x i8> %init, <32 x i8> poison, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
   %intermediate.sroa.0.0.vecblend = shufflevector <64 x i8> %intermediate.sroa.0.0.vec.expand, <64 x i8> <i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0>, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 96, i32 97, i32 98, i32 99, i32 100, i32 101, i32 102, i32 103, i32 104, i32 105, i32 106, i32 107, i32 108, i32 109, i32 110, i32 111, i32 112, i32 113, i32 114, i32 115, i32 116, i32 117, i32 118, i32 119, i32 120, i32 121, i32 122, i32 123, i32 124, i32 125, i32 126, i32 127>
@@ -1405,96 +2591,326 @@ define void @load_8byte_chunk_of_64byte_alloca_with_zero_upper_half(ptr %src, i6
 }
 
 define void @load_16byte_chunk_of_64byte_alloca_with_zero_upper_half(ptr %src, i64 %byteOff, ptr %dst) nounwind {
-; X64-LABEL: load_16byte_chunk_of_64byte_alloca_with_zero_upper_half:
-; X64:       # %bb.0:
-; X64-NEXT:    movdqu (%rdi), %xmm0
-; X64-NEXT:    movdqu 16(%rdi), %xmm1
-; X64-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[2,3,2,3]
-; X64-NEXT:    pshufd {{.*#+}} xmm3 = xmm1[2,3,2,3]
-; X64-NEXT:    movq %xmm1, -{{[0-9]+}}(%rsp)
-; X64-NEXT:    movq %xmm0, -{{[0-9]+}}(%rsp)
-; X64-NEXT:    movq %xmm3, -{{[0-9]+}}(%rsp)
-; X64-NEXT:    movq %xmm2, -{{[0-9]+}}(%rsp)
-; X64-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
-; X64-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
-; X64-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
-; X64-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
-; X64-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
-; X64-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
-; X64-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
-; X64-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
-; X64-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
-; X64-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
-; X64-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
-; X64-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
-; X64-NEXT:    andl $63, %esi
-; X64-NEXT:    movq -128(%rsp,%rsi), %rax
-; X64-NEXT:    movq -120(%rsp,%rsi), %rcx
-; X64-NEXT:    movq %rcx, 8(%rdx)
-; X64-NEXT:    movq %rax, (%rdx)
-; X64-NEXT:    retq
-;
-; X86-LABEL: load_16byte_chunk_of_64byte_alloca_with_zero_upper_half:
-; X86:       # %bb.0:
-; X86-NEXT:    pushl %edi
-; X86-NEXT:    pushl %esi
-; X86-NEXT:    subl $128, %esp
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-NEXT:    movdqu (%edx), %xmm0
-; X86-NEXT:    movdqu 16(%edx), %xmm1
-; X86-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[1,1,1,1]
-; X86-NEXT:    pshufd {{.*#+}} xmm3 = xmm0[2,3,2,3]
-; X86-NEXT:    pshufd {{.*#+}} xmm4 = xmm0[3,3,3,3]
-; X86-NEXT:    pshufd {{.*#+}} xmm5 = xmm1[1,1,1,1]
-; X86-NEXT:    pshufd {{.*#+}} xmm6 = xmm1[2,3,2,3]
-; X86-NEXT:    pshufd {{.*#+}} xmm7 = xmm1[3,3,3,3]
-; X86-NEXT:    movd %xmm1, {{[0-9]+}}(%esp)
-; X86-NEXT:    movd %xmm0, (%esp)
-; X86-NEXT:    movd %xmm7, {{[0-9]+}}(%esp)
-; X86-NEXT:    movd %xmm6, {{[0-9]+}}(%esp)
-; X86-NEXT:    movd %xmm5, {{[0-9]+}}(%esp)
-; X86-NEXT:    movd %xmm4, {{[0-9]+}}(%esp)
-; X86-NEXT:    movd %xmm3, {{[0-9]+}}(%esp)
-; X86-NEXT:    movd %xmm2, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT:    andl $63, %ecx
-; X86-NEXT:    movl (%esp,%ecx), %edx
-; X86-NEXT:    movl 4(%esp,%ecx), %esi
-; X86-NEXT:    movl 8(%esp,%ecx), %edi
-; X86-NEXT:    movl 12(%esp,%ecx), %ecx
-; X86-NEXT:    movl %ecx, 12(%eax)
-; X86-NEXT:    movl %edi, 8(%eax)
-; X86-NEXT:    movl %esi, 4(%eax)
-; X86-NEXT:    movl %edx, (%eax)
-; X86-NEXT:    addl $128, %esp
-; X86-NEXT:    popl %esi
-; X86-NEXT:    popl %edi
-; X86-NEXT:    retl
+; X64-NO-BMI2-NO-SHLD-LABEL: load_16byte_chunk_of_64byte_alloca_with_zero_upper_half:
+; X64-NO-BMI2-NO-SHLD:       # %bb.0:
+; X64-NO-BMI2-NO-SHLD-NEXT:    pushq %rax
+; X64-NO-BMI2-NO-SHLD-NEXT:    movups (%rdi), %xmm0
+; X64-NO-BMI2-NO-SHLD-NEXT:    movups 16(%rdi), %xmm1
+; X64-NO-BMI2-NO-SHLD-NEXT:    xorps %xmm2, %xmm2
+; X64-NO-BMI2-NO-SHLD-NEXT:    movaps %xmm2, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-NO-SHLD-NEXT:    movaps %xmm2, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-NO-SHLD-NEXT:    movaps %xmm2, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-NO-SHLD-NEXT:    movaps %xmm2, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-NO-SHLD-NEXT:    movaps %xmm2, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-NO-SHLD-NEXT:    movaps %xmm2, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-NO-SHLD-NEXT:    movaps %xmm1, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-NO-SHLD-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-NO-SHLD-NEXT:    leal (,%rsi,8), %eax
+; X64-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %edi
+; X64-NO-BMI2-NO-SHLD-NEXT:    andl $56, %edi
+; X64-NO-BMI2-NO-SHLD-NEXT:    andl $56, %esi
+; X64-NO-BMI2-NO-SHLD-NEXT:    movq -128(%rsp,%rsi), %r8
+; X64-NO-BMI2-NO-SHLD-NEXT:    movq -120(%rsp,%rsi), %r9
+; X64-NO-BMI2-NO-SHLD-NEXT:    movl %edi, %ecx
+; X64-NO-BMI2-NO-SHLD-NEXT:    shrq %cl, %r8
+; X64-NO-BMI2-NO-SHLD-NEXT:    notb %cl
+; X64-NO-BMI2-NO-SHLD-NEXT:    leaq (%r9,%r9), %r10
+; X64-NO-BMI2-NO-SHLD-NEXT:    shlq %cl, %r10
+; X64-NO-BMI2-NO-SHLD-NEXT:    orq %r8, %r10
+; X64-NO-BMI2-NO-SHLD-NEXT:    movl %edi, %ecx
+; X64-NO-BMI2-NO-SHLD-NEXT:    shrq %cl, %r9
+; X64-NO-BMI2-NO-SHLD-NEXT:    notl %eax
+; X64-NO-BMI2-NO-SHLD-NEXT:    movq -112(%rsp,%rsi), %rsi
+; X64-NO-BMI2-NO-SHLD-NEXT:    addq %rsi, %rsi
+; X64-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %ecx
+; X64-NO-BMI2-NO-SHLD-NEXT:    shlq %cl, %rsi
+; X64-NO-BMI2-NO-SHLD-NEXT:    orq %r9, %rsi
+; X64-NO-BMI2-NO-SHLD-NEXT:    movq %rsi, 8(%rdx)
+; X64-NO-BMI2-NO-SHLD-NEXT:    movq %r10, (%rdx)
+; X64-NO-BMI2-NO-SHLD-NEXT:    popq %rax
+; X64-NO-BMI2-NO-SHLD-NEXT:    retq
+;
+; X64-NO-BMI2-HAVE-SHLD-LABEL: load_16byte_chunk_of_64byte_alloca_with_zero_upper_half:
+; X64-NO-BMI2-HAVE-SHLD:       # %bb.0:
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    pushq %rax
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movups (%rdi), %xmm0
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movups 16(%rdi), %xmm1
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    xorps %xmm2, %xmm2
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movaps %xmm2, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movaps %xmm2, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movaps %xmm2, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movaps %xmm2, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movaps %xmm2, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movaps %xmm2, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movaps %xmm1, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    leal (,%rsi,8), %eax
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, %edi
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    andl $56, %edi
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    andl $56, %esi
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq -128(%rsp,%rsi), %r8
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq -120(%rsp,%rsi), %r9
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %r9, %r10
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movl %edi, %ecx
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    shrq %cl, %r10
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    notl %eax
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq -112(%rsp,%rsi), %rsi
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    addq %rsi, %rsi
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, %ecx
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    shlq %cl, %rsi
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    orq %r10, %rsi
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movl %edi, %ecx
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    shrdq %cl, %r9, %r8
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %r8, (%rdx)
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %rsi, 8(%rdx)
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    popq %rax
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    retq
+;
+; X64-HAVE-BMI2-NO-SHLD-LABEL: load_16byte_chunk_of_64byte_alloca_with_zero_upper_half:
+; X64-HAVE-BMI2-NO-SHLD:       # %bb.0:
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    pushq %rax
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movups (%rdi), %xmm0
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movups 16(%rdi), %xmm1
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    xorps %xmm2, %xmm2
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movaps %xmm2, -{{[0-9]+}}(%rsp)
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movaps %xmm2, -{{[0-9]+}}(%rsp)
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movaps %xmm2, -{{[0-9]+}}(%rsp)
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movaps %xmm2, -{{[0-9]+}}(%rsp)
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movaps %xmm2, -{{[0-9]+}}(%rsp)
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movaps %xmm2, -{{[0-9]+}}(%rsp)
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movaps %xmm1, -{{[0-9]+}}(%rsp)
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    leal (,%rsi,8), %eax
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, %ecx
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    andl $56, %ecx
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    andl $56, %esi
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    shrxq %rcx, -128(%rsp,%rsi), %rdi
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq -120(%rsp,%rsi), %r8
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    shrxq %rcx, %r8, %r9
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    # kill: def $cl killed $cl killed $rcx def $rcx
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    notb %cl
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq -112(%rsp,%rsi), %rsi
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    addq %r8, %r8
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    shlxq %rcx, %r8, %rcx
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    orq %rdi, %rcx
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    notl %eax
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    addq %rsi, %rsi
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    shlxq %rax, %rsi, %rax
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    orq %rax, %r9
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq %r9, 8(%rdx)
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq %rcx, (%rdx)
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    popq %rax
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    retq
+;
+; X64-HAVE-BMI2-HAVE-SHLD-LABEL: load_16byte_chunk_of_64byte_alloca_with_zero_upper_half:
+; X64-HAVE-BMI2-HAVE-SHLD:       # %bb.0:
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    pushq %rax
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movups (%rdi), %xmm0
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movups 16(%rdi), %xmm1
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    xorps %xmm2, %xmm2
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movaps %xmm2, -{{[0-9]+}}(%rsp)
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movaps %xmm2, -{{[0-9]+}}(%rsp)
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movaps %xmm2, -{{[0-9]+}}(%rsp)
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movaps %xmm2, -{{[0-9]+}}(%rsp)
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movaps %xmm2, -{{[0-9]+}}(%rsp)
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movaps %xmm2, -{{[0-9]+}}(%rsp)
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movaps %xmm1, -{{[0-9]+}}(%rsp)
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    leal (,%rsi,8), %ecx
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ecx, %eax
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    notl %eax
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    andl $56, %esi
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq -112(%rsp,%rsi), %rdi
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    addq %rdi, %rdi
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    shlxq %rax, %rdi, %rax
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    andl $56, %ecx
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq -128(%rsp,%rsi), %rdi
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq -120(%rsp,%rsi), %rsi
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    shrxq %rcx, %rsi, %r8
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    orq %rax, %r8
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    # kill: def $cl killed $cl killed $rcx
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    shrdq %cl, %rsi, %rdi
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq %rdi, (%rdx)
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq %r8, 8(%rdx)
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    popq %rax
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    retq
+;
+; X86-NO-BMI2-NO-SHLD-LABEL: load_16byte_chunk_of_64byte_alloca_with_zero_upper_half:
+; X86-NO-BMI2-NO-SHLD:       # %bb.0:
+; X86-NO-BMI2-NO-SHLD-NEXT:    pushl %ebp
+; X86-NO-BMI2-NO-SHLD-NEXT:    pushl %ebx
+; X86-NO-BMI2-NO-SHLD-NEXT:    pushl %edi
+; X86-NO-BMI2-NO-SHLD-NEXT:    pushl %esi
+; X86-NO-BMI2-NO-SHLD-NEXT:    subl $156, %esp
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT:    movups (%ecx), %xmm0
+; X86-NO-BMI2-NO-SHLD-NEXT:    movups 16(%ecx), %xmm1
+; X86-NO-BMI2-NO-SHLD-NEXT:    xorps %xmm2, %xmm2
+; X86-NO-BMI2-NO-SHLD-NEXT:    movaps %xmm2, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-NO-SHLD-NEXT:    movaps %xmm2, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-NO-SHLD-NEXT:    movaps %xmm2, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-NO-SHLD-NEXT:    movaps %xmm2, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-NO-SHLD-NEXT:    movaps %xmm2, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-NO-SHLD-NEXT:    movaps %xmm2, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-NO-SHLD-NEXT:    movaps %xmm1, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-NO-SHLD-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %esi
+; X86-NO-BMI2-NO-SHLD-NEXT:    andl $60, %esi
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl 16(%esp,%esi), %ebx
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl 20(%esp,%esi), %ebp
+; X86-NO-BMI2-NO-SHLD-NEXT:    shll $3, %eax
+; X86-NO-BMI2-NO-SHLD-NEXT:    andl $24, %eax
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %ebx
+; X86-NO-BMI2-NO-SHLD-NEXT:    leal (%ebp,%ebp), %edi
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %edx
+; X86-NO-BMI2-NO-SHLD-NEXT:    notb %dl
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %edx, %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %edi
+; X86-NO-BMI2-NO-SHLD-NEXT:    orl %ebx, %edi
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %ebp
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl 24(%esp,%esi), %ebx
+; X86-NO-BMI2-NO-SHLD-NEXT:    leal (%ebx,%ebx), %edi
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %edx, %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %edi
+; X86-NO-BMI2-NO-SHLD-NEXT:    orl %ebp, %edi
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %ebx
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl 28(%esp,%esi), %edi
+; X86-NO-BMI2-NO-SHLD-NEXT:    leal (%edi,%edi), %ebp
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %edx, %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %ebp
+; X86-NO-BMI2-NO-SHLD-NEXT:    orl %ebx, %ebp
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %edi
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl 32(%esp,%esi), %eax
+; X86-NO-BMI2-NO-SHLD-NEXT:    addl %eax, %eax
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %edx, %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %eax
+; X86-NO-BMI2-NO-SHLD-NEXT:    orl %edi, %eax
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, 12(%ecx)
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ebp, 8(%ecx)
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, 4(%ecx)
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, (%ecx)
+; X86-NO-BMI2-NO-SHLD-NEXT:    addl $156, %esp
+; X86-NO-BMI2-NO-SHLD-NEXT:    popl %esi
+; X86-NO-BMI2-NO-SHLD-NEXT:    popl %edi
+; X86-NO-BMI2-NO-SHLD-NEXT:    popl %ebx
+; X86-NO-BMI2-NO-SHLD-NEXT:    popl %ebp
+; X86-NO-BMI2-NO-SHLD-NEXT:    retl
+;
+; X86-SHLD-LABEL: load_16byte_chunk_of_64byte_alloca_with_zero_upper_half:
+; X86-SHLD:       # %bb.0:
+; X86-SHLD-NEXT:    pushl %ebp
+; X86-SHLD-NEXT:    pushl %ebx
+; X86-SHLD-NEXT:    pushl %edi
+; X86-SHLD-NEXT:    pushl %esi
+; X86-SHLD-NEXT:    subl $156, %esp
+; X86-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-SHLD-NEXT:    movups (%eax), %xmm0
+; X86-SHLD-NEXT:    movups 16(%eax), %xmm1
+; X86-SHLD-NEXT:    xorps %xmm2, %xmm2
+; X86-SHLD-NEXT:    movaps %xmm2, {{[0-9]+}}(%esp)
+; X86-SHLD-NEXT:    movaps %xmm2, {{[0-9]+}}(%esp)
+; X86-SHLD-NEXT:    movaps %xmm2, {{[0-9]+}}(%esp)
+; X86-SHLD-NEXT:    movaps %xmm2, {{[0-9]+}}(%esp)
+; X86-SHLD-NEXT:    movaps %xmm2, {{[0-9]+}}(%esp)
+; X86-SHLD-NEXT:    movaps %xmm2, {{[0-9]+}}(%esp)
+; X86-SHLD-NEXT:    movaps %xmm1, {{[0-9]+}}(%esp)
+; X86-SHLD-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
+; X86-SHLD-NEXT:    movl %ecx, %edi
+; X86-SHLD-NEXT:    andl $60, %edi
+; X86-SHLD-NEXT:    movl 24(%esp,%edi), %esi
+; X86-SHLD-NEXT:    movl 16(%esp,%edi), %eax
+; X86-SHLD-NEXT:    movl 20(%esp,%edi), %ebx
+; X86-SHLD-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-SHLD-NEXT:    shll $3, %ecx
+; X86-SHLD-NEXT:    andl $24, %ecx
+; X86-SHLD-NEXT:    shrdl %cl, %esi, %ebx
+; X86-SHLD-NEXT:    movl 28(%esp,%edi), %ebp
+; X86-SHLD-NEXT:    shrdl %cl, %ebp, %esi
+; X86-SHLD-NEXT:    movl 32(%esp,%edi), %edi
+; X86-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-SHLD-NEXT:    shrdl %cl, %edi, %ebp
+; X86-SHLD-NEXT:    movl %ebp, 12(%edx)
+; X86-SHLD-NEXT:    movl %esi, 8(%edx)
+; X86-SHLD-NEXT:    movl %ebx, 4(%edx)
+; X86-SHLD-NEXT:    # kill: def $cl killed $cl killed $ecx
+; X86-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X86-SHLD-NEXT:    shrdl %cl, %esi, %eax
+; X86-SHLD-NEXT:    movl %eax, (%edx)
+; X86-SHLD-NEXT:    addl $156, %esp
+; X86-SHLD-NEXT:    popl %esi
+; X86-SHLD-NEXT:    popl %edi
+; X86-SHLD-NEXT:    popl %ebx
+; X86-SHLD-NEXT:    popl %ebp
+; X86-SHLD-NEXT:    retl
+;
+; X86-HAVE-BMI2-NO-SHLD-LABEL: load_16byte_chunk_of_64byte_alloca_with_zero_upper_half:
+; X86-HAVE-BMI2-NO-SHLD:       # %bb.0:
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    pushl %ebp
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    pushl %ebx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    pushl %edi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    pushl %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    subl $156, %esp
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movups (%ecx), %xmm0
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movups 16(%ecx), %xmm1
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    xorps %xmm2, %xmm2
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movaps %xmm2, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movaps %xmm2, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movaps %xmm2, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movaps %xmm2, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movaps %xmm2, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movaps %xmm2, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movaps %xmm1, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    leal (,%eax,8), %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    andl $24, %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    andl $60, %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %ecx, 16(%esp,%eax), %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 20(%esp,%eax), %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %ecx, %edx, %edi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, %ebx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    notb %bl
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 24(%esp,%eax), %ebp
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    addl %edx, %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %ebx, %edx, %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl %esi, %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %ecx, %ebp, %edi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    leal (%ebp,%ebp), %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %ebx, %esi, %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 28(%esp,%eax), %ebp
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %ecx, %ebp, %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    addl %ebp, %ebp
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %ebx, %ebp, %ebp
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl %edi, %ebp
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 32(%esp,%eax), %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    addl %eax, %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %ebx, %eax, %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl %ecx, %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, 12(%ecx)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ebp, 8(%ecx)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %esi, 4(%ecx)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %edx, (%ecx)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    addl $156, %esp
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    popl %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    popl %edi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    popl %ebx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    popl %ebp
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    retl
   %init = load <32 x i8>, ptr %src, align 1
   %intermediate.sroa.0.0.vec.expand = shufflevector <32 x i8> %init, <32 x i8> poison, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
   %intermediate.sroa.0.0.vecblend = shufflevector <64 x i8> %intermediate.sroa.0.0.vec.expand, <64 x i8> <i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0>, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 96, i32 97, i32 98, i32 99, i32 100, i32 101, i32 102, i32 103, i32 104, i32 105, i32 106, i32 107, i32 108, i32 109, i32 110, i32 111, i32 112, i32 113, i32 114, i32 115, i32 116, i32 117, i32 118, i32 119, i32 120, i32 121, i32 122, i32 123, i32 124, i32 125, i32 126, i32 127>
@@ -1509,116 +2925,484 @@ define void @load_16byte_chunk_of_64byte_alloca_with_zero_upper_half(ptr %src, i
 }
 
 define void @load_32byte_chunk_of_64byte_alloca_with_zero_upper_half(ptr %src, i64 %byteOff, ptr %dst) nounwind {
-; X64-LABEL: load_32byte_chunk_of_64byte_alloca_with_zero_upper_half:
-; X64:       # %bb.0:
-; X64-NEXT:    movdqu (%rdi), %xmm0
-; X64-NEXT:    movdqu 16(%rdi), %xmm1
-; X64-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[2,3,2,3]
-; X64-NEXT:    pshufd {{.*#+}} xmm3 = xmm1[2,3,2,3]
-; X64-NEXT:    movq %xmm1, -{{[0-9]+}}(%rsp)
-; X64-NEXT:    movq %xmm0, -{{[0-9]+}}(%rsp)
-; X64-NEXT:    movq %xmm3, -{{[0-9]+}}(%rsp)
-; X64-NEXT:    movq %xmm2, -{{[0-9]+}}(%rsp)
-; X64-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
-; X64-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
-; X64-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
-; X64-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
-; X64-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
-; X64-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
-; X64-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
-; X64-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
-; X64-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
-; X64-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
-; X64-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
-; X64-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
-; X64-NEXT:    andl $63, %esi
-; X64-NEXT:    movq -128(%rsp,%rsi), %rax
-; X64-NEXT:    movq -120(%rsp,%rsi), %rcx
-; X64-NEXT:    movq -112(%rsp,%rsi), %rdi
-; X64-NEXT:    movq -104(%rsp,%rsi), %rsi
-; X64-NEXT:    movq %rsi, 24(%rdx)
-; X64-NEXT:    movq %rdi, 16(%rdx)
-; X64-NEXT:    movq %rcx, 8(%rdx)
-; X64-NEXT:    movq %rax, (%rdx)
-; X64-NEXT:    retq
-;
-; X86-LABEL: load_32byte_chunk_of_64byte_alloca_with_zero_upper_half:
-; X86:       # %bb.0:
-; X86-NEXT:    pushl %ebp
-; X86-NEXT:    pushl %ebx
-; X86-NEXT:    pushl %edi
-; X86-NEXT:    pushl %esi
-; X86-NEXT:    subl $136, %esp
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movdqu (%ecx), %xmm0
-; X86-NEXT:    movdqu 16(%ecx), %xmm1
-; X86-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[1,1,1,1]
-; X86-NEXT:    pshufd {{.*#+}} xmm3 = xmm0[2,3,2,3]
-; X86-NEXT:    pshufd {{.*#+}} xmm4 = xmm0[3,3,3,3]
-; X86-NEXT:    pshufd {{.*#+}} xmm5 = xmm1[1,1,1,1]
-; X86-NEXT:    pshufd {{.*#+}} xmm6 = xmm1[2,3,2,3]
-; X86-NEXT:    pshufd {{.*#+}} xmm7 = xmm1[3,3,3,3]
-; X86-NEXT:    movd %xmm1, {{[0-9]+}}(%esp)
-; X86-NEXT:    movd %xmm0, {{[0-9]+}}(%esp)
-; X86-NEXT:    movd %xmm7, {{[0-9]+}}(%esp)
-; X86-NEXT:    movd %xmm6, {{[0-9]+}}(%esp)
-; X86-NEXT:    movd %xmm5, {{[0-9]+}}(%esp)
-; X86-NEXT:    movd %xmm4, {{[0-9]+}}(%esp)
-; X86-NEXT:    movd %xmm3, {{[0-9]+}}(%esp)
-; X86-NEXT:    movd %xmm2, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT:    andl $63, %eax
-; X86-NEXT:    movl 8(%esp,%eax), %ecx
-; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl 12(%esp,%eax), %ecx
-; X86-NEXT:    movl %ecx, (%esp) # 4-byte Spill
-; X86-NEXT:    movl 16(%esp,%eax), %esi
-; X86-NEXT:    movl 20(%esp,%eax), %edi
-; X86-NEXT:    movl 24(%esp,%eax), %ebx
-; X86-NEXT:    movl 28(%esp,%eax), %ebp
-; X86-NEXT:    movl 32(%esp,%eax), %edx
-; X86-NEXT:    movl 36(%esp,%eax), %ecx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    movl %ecx, 28(%eax)
-; X86-NEXT:    movl %edx, 24(%eax)
-; X86-NEXT:    movl %ebp, 20(%eax)
-; X86-NEXT:    movl %ebx, 16(%eax)
-; X86-NEXT:    movl %edi, 12(%eax)
-; X86-NEXT:    movl %esi, 8(%eax)
-; X86-NEXT:    movl (%esp), %ecx # 4-byte Reload
-; X86-NEXT:    movl %ecx, 4(%eax)
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NEXT:    movl %ecx, (%eax)
-; X86-NEXT:    addl $136, %esp
-; X86-NEXT:    popl %esi
-; X86-NEXT:    popl %edi
-; X86-NEXT:    popl %ebx
-; X86-NEXT:    popl %ebp
-; X86-NEXT:    retl
+; X64-NO-BMI2-NO-SHLD-LABEL: load_32byte_chunk_of_64byte_alloca_with_zero_upper_half:
+; X64-NO-BMI2-NO-SHLD:       # %bb.0:
+; X64-NO-BMI2-NO-SHLD-NEXT:    pushq %r14
+; X64-NO-BMI2-NO-SHLD-NEXT:    pushq %rbx
+; X64-NO-BMI2-NO-SHLD-NEXT:    pushq %rax
+; X64-NO-BMI2-NO-SHLD-NEXT:    movups (%rdi), %xmm0
+; X64-NO-BMI2-NO-SHLD-NEXT:    movups 16(%rdi), %xmm1
+; X64-NO-BMI2-NO-SHLD-NEXT:    xorps %xmm2, %xmm2
+; X64-NO-BMI2-NO-SHLD-NEXT:    movaps %xmm2, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-NO-SHLD-NEXT:    movaps %xmm2, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-NO-SHLD-NEXT:    movaps %xmm2, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-NO-SHLD-NEXT:    movaps %xmm2, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-NO-SHLD-NEXT:    movaps %xmm2, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-NO-SHLD-NEXT:    movaps %xmm2, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-NO-SHLD-NEXT:    movaps %xmm1, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-NO-SHLD-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-NO-SHLD-NEXT:    leal (,%rsi,8), %eax
+; X64-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %edi
+; X64-NO-BMI2-NO-SHLD-NEXT:    andl $56, %edi
+; X64-NO-BMI2-NO-SHLD-NEXT:    andl $56, %esi
+; X64-NO-BMI2-NO-SHLD-NEXT:    movq -128(%rsp,%rsi), %r10
+; X64-NO-BMI2-NO-SHLD-NEXT:    movq -120(%rsp,%rsi), %r11
+; X64-NO-BMI2-NO-SHLD-NEXT:    movl %edi, %ecx
+; X64-NO-BMI2-NO-SHLD-NEXT:    shrq %cl, %r10
+; X64-NO-BMI2-NO-SHLD-NEXT:    movl %edi, %r8d
+; X64-NO-BMI2-NO-SHLD-NEXT:    notb %r8b
+; X64-NO-BMI2-NO-SHLD-NEXT:    leaq (%r11,%r11), %r9
+; X64-NO-BMI2-NO-SHLD-NEXT:    movl %r8d, %ecx
+; X64-NO-BMI2-NO-SHLD-NEXT:    shlq %cl, %r9
+; X64-NO-BMI2-NO-SHLD-NEXT:    orq %r10, %r9
+; X64-NO-BMI2-NO-SHLD-NEXT:    movl %edi, %ecx
+; X64-NO-BMI2-NO-SHLD-NEXT:    shrq %cl, %r11
+; X64-NO-BMI2-NO-SHLD-NEXT:    notl %eax
+; X64-NO-BMI2-NO-SHLD-NEXT:    andl $63, %eax
+; X64-NO-BMI2-NO-SHLD-NEXT:    movq -112(%rsp,%rsi), %r10
+; X64-NO-BMI2-NO-SHLD-NEXT:    leaq (%r10,%r10), %rbx
+; X64-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %ecx
+; X64-NO-BMI2-NO-SHLD-NEXT:    shlq %cl, %rbx
+; X64-NO-BMI2-NO-SHLD-NEXT:    orq %r11, %rbx
+; X64-NO-BMI2-NO-SHLD-NEXT:    movl %edi, %ecx
+; X64-NO-BMI2-NO-SHLD-NEXT:    shrq %cl, %r10
+; X64-NO-BMI2-NO-SHLD-NEXT:    movq -104(%rsp,%rsi), %r11
+; X64-NO-BMI2-NO-SHLD-NEXT:    leaq (%r11,%r11), %r14
+; X64-NO-BMI2-NO-SHLD-NEXT:    movl %r8d, %ecx
+; X64-NO-BMI2-NO-SHLD-NEXT:    shlq %cl, %r14
+; X64-NO-BMI2-NO-SHLD-NEXT:    orq %r10, %r14
+; X64-NO-BMI2-NO-SHLD-NEXT:    movl %edi, %ecx
+; X64-NO-BMI2-NO-SHLD-NEXT:    shrq %cl, %r11
+; X64-NO-BMI2-NO-SHLD-NEXT:    movq -96(%rsp,%rsi), %rsi
+; X64-NO-BMI2-NO-SHLD-NEXT:    addq %rsi, %rsi
+; X64-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %ecx
+; X64-NO-BMI2-NO-SHLD-NEXT:    shlq %cl, %rsi
+; X64-NO-BMI2-NO-SHLD-NEXT:    orq %r11, %rsi
+; X64-NO-BMI2-NO-SHLD-NEXT:    movq %rsi, 24(%rdx)
+; X64-NO-BMI2-NO-SHLD-NEXT:    movq %r14, 16(%rdx)
+; X64-NO-BMI2-NO-SHLD-NEXT:    movq %rbx, 8(%rdx)
+; X64-NO-BMI2-NO-SHLD-NEXT:    movq %r9, (%rdx)
+; X64-NO-BMI2-NO-SHLD-NEXT:    addq $8, %rsp
+; X64-NO-BMI2-NO-SHLD-NEXT:    popq %rbx
+; X64-NO-BMI2-NO-SHLD-NEXT:    popq %r14
+; X64-NO-BMI2-NO-SHLD-NEXT:    retq
+;
+; X64-NO-BMI2-HAVE-SHLD-LABEL: load_32byte_chunk_of_64byte_alloca_with_zero_upper_half:
+; X64-NO-BMI2-HAVE-SHLD:       # %bb.0:
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    pushq %r14
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    pushq %rbx
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    pushq %rax
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movups (%rdi), %xmm0
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movups 16(%rdi), %xmm1
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    xorps %xmm2, %xmm2
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movaps %xmm2, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movaps %xmm2, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movaps %xmm2, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movaps %xmm2, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movaps %xmm2, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movaps %xmm2, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movaps %xmm1, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    leal (,%rsi,8), %edi
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movl %edi, %eax
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    andl $56, %eax
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    andl $56, %esi
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq -128(%rsp,%rsi), %r8
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq -120(%rsp,%rsi), %r9
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %r9, %r10
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, %ecx
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    shrq %cl, %r10
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    notl %edi
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    andl $63, %edi
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq -112(%rsp,%rsi), %r11
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    leaq (%r11,%r11), %rbx
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movl %edi, %ecx
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    shlq %cl, %rbx
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    orq %r10, %rbx
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq -104(%rsp,%rsi), %r10
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %r10, %r14
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, %ecx
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    shrq %cl, %r14
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq -96(%rsp,%rsi), %rsi
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    addq %rsi, %rsi
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movl %edi, %ecx
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    shlq %cl, %rsi
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    orq %r14, %rsi
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, %ecx
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    shrdq %cl, %r10, %r11
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    shrdq %cl, %r9, %r8
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %r11, 16(%rdx)
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %r8, (%rdx)
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %rsi, 24(%rdx)
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %rbx, 8(%rdx)
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    addq $8, %rsp
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    popq %rbx
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    popq %r14
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    retq
+;
+; X64-HAVE-BMI2-NO-SHLD-LABEL: load_32byte_chunk_of_64byte_alloca_with_zero_upper_half:
+; X64-HAVE-BMI2-NO-SHLD:       # %bb.0:
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    pushq %r14
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    pushq %rbx
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    pushq %rax
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movups (%rdi), %xmm0
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movups 16(%rdi), %xmm1
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    xorps %xmm2, %xmm2
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movaps %xmm2, -{{[0-9]+}}(%rsp)
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movaps %xmm2, -{{[0-9]+}}(%rsp)
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movaps %xmm2, -{{[0-9]+}}(%rsp)
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movaps %xmm2, -{{[0-9]+}}(%rsp)
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movaps %xmm2, -{{[0-9]+}}(%rsp)
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movaps %xmm2, -{{[0-9]+}}(%rsp)
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movaps %xmm1, -{{[0-9]+}}(%rsp)
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    leal (,%rsi,8), %eax
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, %ecx
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    andl $56, %ecx
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    andl $56, %esi
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    shrxq %rcx, -128(%rsp,%rsi), %rdi
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq -120(%rsp,%rsi), %r8
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq -112(%rsp,%rsi), %r9
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    shrxq %rcx, %r8, %r10
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    shrxq %rcx, %r9, %r11
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq -104(%rsp,%rsi), %rbx
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    shrxq %rcx, %rbx, %r14
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    # kill: def $cl killed $cl killed $rcx def $rcx
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    notb %cl
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    addq %r8, %r8
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    shlxq %rcx, %r8, %r8
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    orq %rdi, %r8
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    notl %eax
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    andl $63, %eax
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    leaq (%r9,%r9), %rdi
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    shlxq %rax, %rdi, %rdi
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    orq %r10, %rdi
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    leaq (%rbx,%rbx), %r9
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    shlxq %rcx, %r9, %rcx
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    orq %r11, %rcx
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq -96(%rsp,%rsi), %rsi
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    addq %rsi, %rsi
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    shlxq %rax, %rsi, %rax
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    orq %r14, %rax
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq %rax, 24(%rdx)
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq %rcx, 16(%rdx)
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq %rdi, 8(%rdx)
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq %r8, (%rdx)
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    addq $8, %rsp
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    popq %rbx
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    popq %r14
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    retq
+;
+; X64-HAVE-BMI2-HAVE-SHLD-LABEL: load_32byte_chunk_of_64byte_alloca_with_zero_upper_half:
+; X64-HAVE-BMI2-HAVE-SHLD:       # %bb.0:
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    pushq %rbx
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movups (%rdi), %xmm0
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movups 16(%rdi), %xmm1
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    xorps %xmm2, %xmm2
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movaps %xmm2, -{{[0-9]+}}(%rsp)
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movaps %xmm2, -{{[0-9]+}}(%rsp)
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movaps %xmm2, -{{[0-9]+}}(%rsp)
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movaps %xmm2, -{{[0-9]+}}(%rsp)
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movaps %xmm2, -{{[0-9]+}}(%rsp)
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movaps %xmm2, -{{[0-9]+}}(%rsp)
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movaps %xmm1, -{{[0-9]+}}(%rsp)
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    leal (,%rsi,8), %eax
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, %ecx
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    andl $56, %ecx
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    andl $56, %esi
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq -128(%rsp,%rsi), %rdi
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq -120(%rsp,%rsi), %r8
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    shrxq %rcx, %r8, %r9
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    notl %eax
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    andl $63, %eax
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq -112(%rsp,%rsi), %r10
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    leaq (%r10,%r10), %r11
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    shlxq %rax, %r11, %r11
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    orq %r9, %r11
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq -104(%rsp,%rsi), %r9
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    shrxq %rcx, %r9, %rbx
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq -96(%rsp,%rsi), %rsi
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    addq %rsi, %rsi
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    shlxq %rax, %rsi, %rax
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    orq %rbx, %rax
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    shrdq %cl, %r9, %r10
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    # kill: def $cl killed $cl killed $rcx
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    shrdq %cl, %r8, %rdi
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq %r10, 16(%rdx)
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq %rdi, (%rdx)
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq %rax, 24(%rdx)
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq %r11, 8(%rdx)
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    popq %rbx
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    retq
+;
+; X86-NO-BMI2-NO-SHLD-LABEL: load_32byte_chunk_of_64byte_alloca_with_zero_upper_half:
+; X86-NO-BMI2-NO-SHLD:       # %bb.0:
+; X86-NO-BMI2-NO-SHLD-NEXT:    pushl %ebp
+; X86-NO-BMI2-NO-SHLD-NEXT:    pushl %ebx
+; X86-NO-BMI2-NO-SHLD-NEXT:    pushl %edi
+; X86-NO-BMI2-NO-SHLD-NEXT:    pushl %esi
+; X86-NO-BMI2-NO-SHLD-NEXT:    subl $172, %esp
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT:    movups (%ecx), %xmm0
+; X86-NO-BMI2-NO-SHLD-NEXT:    movups 16(%ecx), %xmm1
+; X86-NO-BMI2-NO-SHLD-NEXT:    xorps %xmm2, %xmm2
+; X86-NO-BMI2-NO-SHLD-NEXT:    movaps %xmm2, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-NO-SHLD-NEXT:    movaps %xmm2, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-NO-SHLD-NEXT:    movaps %xmm2, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-NO-SHLD-NEXT:    movaps %xmm2, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-NO-SHLD-NEXT:    movaps %xmm2, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-NO-SHLD-NEXT:    movaps %xmm2, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-NO-SHLD-NEXT:    movaps %xmm1, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-NO-SHLD-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %edi
+; X86-NO-BMI2-NO-SHLD-NEXT:    andl $60, %edi
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl 32(%esp,%edi), %ebx
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl 36(%esp,%edi), %esi
+; X86-NO-BMI2-NO-SHLD-NEXT:    shll $3, %eax
+; X86-NO-BMI2-NO-SHLD-NEXT:    andl $24, %eax
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %ebx
+; X86-NO-BMI2-NO-SHLD-NEXT:    leal (%esi,%esi), %ebp
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %edx
+; X86-NO-BMI2-NO-SHLD-NEXT:    notb %dl
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %edx, %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %ebp
+; X86-NO-BMI2-NO-SHLD-NEXT:    orl %ebx, %ebp
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %esi
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl 40(%esp,%edi), %ebx
+; X86-NO-BMI2-NO-SHLD-NEXT:    leal (%ebx,%ebx), %ebp
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %edx, %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %ebp
+; X86-NO-BMI2-NO-SHLD-NEXT:    orl %esi, %ebp
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %ebx
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl 44(%esp,%edi), %esi
+; X86-NO-BMI2-NO-SHLD-NEXT:    leal (%esi,%esi), %ebp
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %edx, %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %ebp
+; X86-NO-BMI2-NO-SHLD-NEXT:    orl %ebx, %ebp
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %esi
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl 48(%esp,%edi), %ebx
+; X86-NO-BMI2-NO-SHLD-NEXT:    leal (%ebx,%ebx), %ebp
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %edx, %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %ebp
+; X86-NO-BMI2-NO-SHLD-NEXT:    orl %esi, %ebp
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %ebx
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl 52(%esp,%edi), %esi
+; X86-NO-BMI2-NO-SHLD-NEXT:    leal (%esi,%esi), %ebp
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %edx, %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %ebp
+; X86-NO-BMI2-NO-SHLD-NEXT:    orl %ebx, %ebp
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %esi
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl 56(%esp,%edi), %ebp
+; X86-NO-BMI2-NO-SHLD-NEXT:    leal (%ebp,%ebp), %ebx
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %edx, %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %ebx
+; X86-NO-BMI2-NO-SHLD-NEXT:    orl %esi, %ebx
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %ebp
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl 60(%esp,%edi), %ebx
+; X86-NO-BMI2-NO-SHLD-NEXT:    leal (%ebx,%ebx), %esi
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %edx, %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %esi
+; X86-NO-BMI2-NO-SHLD-NEXT:    orl %ebp, %esi
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %ebx
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl 64(%esp,%edi), %eax
+; X86-NO-BMI2-NO-SHLD-NEXT:    addl %eax, %eax
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %edx, %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %eax
+; X86-NO-BMI2-NO-SHLD-NEXT:    orl %ebx, %eax
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, 28(%ecx)
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %esi, 24(%ecx)
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, 20(%ecx)
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, 16(%ecx)
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, 12(%ecx)
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, 8(%ecx)
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, 4(%ecx)
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, (%ecx)
+; X86-NO-BMI2-NO-SHLD-NEXT:    addl $172, %esp
+; X86-NO-BMI2-NO-SHLD-NEXT:    popl %esi
+; X86-NO-BMI2-NO-SHLD-NEXT:    popl %edi
+; X86-NO-BMI2-NO-SHLD-NEXT:    popl %ebx
+; X86-NO-BMI2-NO-SHLD-NEXT:    popl %ebp
+; X86-NO-BMI2-NO-SHLD-NEXT:    retl
+;
+; X86-SHLD-LABEL: load_32byte_chunk_of_64byte_alloca_with_zero_upper_half:
+; X86-SHLD:       # %bb.0:
+; X86-SHLD-NEXT:    pushl %ebp
+; X86-SHLD-NEXT:    pushl %ebx
+; X86-SHLD-NEXT:    pushl %edi
+; X86-SHLD-NEXT:    pushl %esi
+; X86-SHLD-NEXT:    subl $156, %esp
+; X86-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-SHLD-NEXT:    movups (%eax), %xmm0
+; X86-SHLD-NEXT:    movups 16(%eax), %xmm1
+; X86-SHLD-NEXT:    xorps %xmm2, %xmm2
+; X86-SHLD-NEXT:    movaps %xmm2, {{[0-9]+}}(%esp)
+; X86-SHLD-NEXT:    movaps %xmm2, {{[0-9]+}}(%esp)
+; X86-SHLD-NEXT:    movaps %xmm2, {{[0-9]+}}(%esp)
+; X86-SHLD-NEXT:    movaps %xmm2, {{[0-9]+}}(%esp)
+; X86-SHLD-NEXT:    movaps %xmm2, {{[0-9]+}}(%esp)
+; X86-SHLD-NEXT:    movaps %xmm2, {{[0-9]+}}(%esp)
+; X86-SHLD-NEXT:    movaps %xmm1, {{[0-9]+}}(%esp)
+; X86-SHLD-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
+; X86-SHLD-NEXT:    movl %ecx, %edi
+; X86-SHLD-NEXT:    andl $60, %edi
+; X86-SHLD-NEXT:    movl 24(%esp,%edi), %edx
+; X86-SHLD-NEXT:    movl 20(%esp,%edi), %eax
+; X86-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-SHLD-NEXT:    shll $3, %ecx
+; X86-SHLD-NEXT:    andl $24, %ecx
+; X86-SHLD-NEXT:    movl %eax, %esi
+; X86-SHLD-NEXT:    movl %edx, %eax
+; X86-SHLD-NEXT:    shrdl %cl, %edx, %esi
+; X86-SHLD-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-SHLD-NEXT:    movl 28(%esp,%edi), %edx
+; X86-SHLD-NEXT:    shrdl %cl, %edx, %eax
+; X86-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-SHLD-NEXT:    movl 32(%esp,%edi), %ebp
+; X86-SHLD-NEXT:    shrdl %cl, %ebp, %edx
+; X86-SHLD-NEXT:    movl %edx, (%esp) # 4-byte Spill
+; X86-SHLD-NEXT:    movl 36(%esp,%edi), %esi
+; X86-SHLD-NEXT:    shrdl %cl, %esi, %ebp
+; X86-SHLD-NEXT:    movl 40(%esp,%edi), %edx
+; X86-SHLD-NEXT:    shrdl %cl, %edx, %esi
+; X86-SHLD-NEXT:    movl 44(%esp,%edi), %eax
+; X86-SHLD-NEXT:    shrdl %cl, %eax, %edx
+; X86-SHLD-NEXT:    movl 16(%esp,%edi), %ebx
+; X86-SHLD-NEXT:    movl 48(%esp,%edi), %edi
+; X86-SHLD-NEXT:    shrdl %cl, %edi, %eax
+; X86-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %edi
+; X86-SHLD-NEXT:    movl %eax, 28(%edi)
+; X86-SHLD-NEXT:    movl %edx, 24(%edi)
+; X86-SHLD-NEXT:    movl %esi, 20(%edi)
+; X86-SHLD-NEXT:    movl %ebp, 16(%edi)
+; X86-SHLD-NEXT:    movl (%esp), %eax # 4-byte Reload
+; X86-SHLD-NEXT:    movl %eax, 12(%edi)
+; X86-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-SHLD-NEXT:    movl %eax, 8(%edi)
+; X86-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-SHLD-NEXT:    movl %eax, 4(%edi)
+; X86-SHLD-NEXT:    # kill: def $cl killed $cl killed $ecx
+; X86-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-SHLD-NEXT:    shrdl %cl, %eax, %ebx
+; X86-SHLD-NEXT:    movl %ebx, (%edi)
+; X86-SHLD-NEXT:    addl $156, %esp
+; X86-SHLD-NEXT:    popl %esi
+; X86-SHLD-NEXT:    popl %edi
+; X86-SHLD-NEXT:    popl %ebx
+; X86-SHLD-NEXT:    popl %ebp
+; X86-SHLD-NEXT:    retl
+;
+; X86-HAVE-BMI2-NO-SHLD-LABEL: load_32byte_chunk_of_64byte_alloca_with_zero_upper_half:
+; X86-HAVE-BMI2-NO-SHLD:       # %bb.0:
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    pushl %ebp
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    pushl %ebx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    pushl %edi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    pushl %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    subl $156, %esp
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movups (%ecx), %xmm0
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movups 16(%ecx), %xmm1
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    xorps %xmm2, %xmm2
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movaps %xmm2, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movaps %xmm2, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movaps %xmm2, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movaps %xmm2, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movaps %xmm2, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movaps %xmm2, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movaps %xmm1, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    leal (,%eax,8), %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    andl $24, %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    andl $60, %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %ecx, 16(%esp,%eax), %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 20(%esp,%eax), %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %ecx, %esi, %edi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, %ebx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    notb %bl
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 24(%esp,%eax), %ebp
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    addl %esi, %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %ebx, %esi, %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl %edx, %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %ecx, %ebp, %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    leal (%ebp,%ebp), %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %ebx, %esi, %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl %edi, %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 28(%esp,%eax), %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    leal (%esi,%esi), %edi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %ebx, %edi, %edi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl %edx, %edi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %ecx, %esi, %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 32(%esp,%eax), %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    leal (%esi,%esi), %ebp
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %ebx, %ebp, %edi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl %edx, %edi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %edi, (%esp) # 4-byte Spill
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %ecx, %esi, %ebp
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 36(%esp,%eax), %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    leal (%edx,%edx), %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %ebx, %esi, %edi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl %ebp, %edi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %ecx, %edx, %ebp
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 40(%esp,%eax), %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    leal (%edx,%edx), %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %ebx, %esi, %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl %ebp, %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %ecx, %edx, %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 44(%esp,%eax), %ebp
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %ecx, %ebp, %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    addl %ebp, %ebp
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %ebx, %ebp, %ebp
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl %edx, %ebp
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 48(%esp,%eax), %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    addl %eax, %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %ebx, %eax, %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl %ecx, %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, 28(%ecx)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ebp, 24(%ecx)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %esi, 20(%ecx)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %edi, 16(%ecx)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl (%esp), %eax # 4-byte Reload
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, 12(%ecx)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, 8(%ecx)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, 4(%ecx)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, (%ecx)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    addl $156, %esp
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    popl %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    popl %edi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    popl %ebx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    popl %ebp
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    retl
   %init = load <32 x i8>, ptr %src, align 1
   %intermediate.sroa.0.0.vec.expand = shufflevector <32 x i8> %init, <32 x i8> poison, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
   %intermediate.sroa.0.0.vecblend = shufflevector <64 x i8> %intermediate.sroa.0.0.vec.expand, <64 x i8> <i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0>, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 96, i32 97, i32 98, i32 99, i32 100, i32 101, i32 102, i32 103, i32 104, i32 105, i32 106, i32 107, i32 108, i32 109, i32 110, i32 111, i32 112, i32 113, i32 114, i32 115, i32 116, i32 117, i32 118, i32 119, i32 120, i32 121, i32 122, i32 123, i32 124, i32 125, i32 126, i32 127>
@@ -1633,9 +3417,9 @@ define void @load_32byte_chunk_of_64byte_alloca_with_zero_upper_half(ptr %src, i
 }
 ;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
 ; ALL: {{.*}}
-; X64-HAVE-BMI2-HAVE-SHLD: {{.*}}
-; X64-NO-BMI2-HAVE-SHLD: {{.*}}
+; X64: {{.*}}
 ; X64-NO-SHLD: {{.*}}
+; X86: {{.*}}
 ; X86-HAVE-BMI2-HAVE-SHLD: {{.*}}
 ; X86-NO-BMI2-HAVE-SHLD: {{.*}}
 ; X86-NO-SHLD: {{.*}}

diff  --git a/llvm/test/CodeGen/X86/widen-load-of-small-alloca.ll b/llvm/test/CodeGen/X86/widen-load-of-small-alloca.ll
index 4a47e7613dfa6d..ff13f4ba577f2e 100644
--- a/llvm/test/CodeGen/X86/widen-load-of-small-alloca.ll
+++ b/llvm/test/CodeGen/X86/widen-load-of-small-alloca.ll
@@ -603,32 +603,86 @@ define void @load_1byte_chunk_of_16byte_alloca(ptr %src, i64 %byteOff, ptr %dst)
 ; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movb %sil, (%rdx)
 ; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    retq
 ;
-; X86-LABEL: load_1byte_chunk_of_16byte_alloca:
-; X86:       # %bb.0:
-; X86-NEXT:    subl $32, %esp
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-NEXT:    movdqu (%edx), %xmm0
-; X86-NEXT:    shll $3, %ecx
-; X86-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
-; X86-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[2,3,2,3]
-; X86-NEXT:    pshufd {{.*#+}} xmm3 = xmm0[3,3,3,3]
-; X86-NEXT:    movd %xmm0, (%esp)
-; X86-NEXT:    movd %xmm3, {{[0-9]+}}(%esp)
-; X86-NEXT:    movd %xmm2, {{[0-9]+}}(%esp)
-; X86-NEXT:    movd %xmm1, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT:    shrb $3, %cl
-; X86-NEXT:    andb $15, %cl
-; X86-NEXT:    movzbl %cl, %ecx
-; X86-NEXT:    movzbl (%esp,%ecx), %ecx
-; X86-NEXT:    movb %cl, (%eax)
-; X86-NEXT:    addl $32, %esp
-; X86-NEXT:    retl
+; X86-NO-BMI2-NO-SHLD-LABEL: load_1byte_chunk_of_16byte_alloca:
+; X86-NO-BMI2-NO-SHLD:       # %bb.0:
+; X86-NO-BMI2-NO-SHLD-NEXT:    pushl %esi
+; X86-NO-BMI2-NO-SHLD-NEXT:    subl $40, %esp
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-NO-BMI2-NO-SHLD-NEXT:    movups (%edx), %xmm0
+; X86-NO-BMI2-NO-SHLD-NEXT:    shll $3, %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT:    xorps %xmm1, %xmm1
+; X86-NO-BMI2-NO-SHLD-NEXT:    movaps %xmm1, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-NO-SHLD-NEXT:    movaps %xmm0, (%esp)
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ecx, %edx
+; X86-NO-BMI2-NO-SHLD-NEXT:    shrb $3, %dl
+; X86-NO-BMI2-NO-SHLD-NEXT:    andb $12, %dl
+; X86-NO-BMI2-NO-SHLD-NEXT:    movzbl %dl, %edx
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl (%esp,%edx), %esi
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl 4(%esp,%edx), %edx
+; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %esi
+; X86-NO-BMI2-NO-SHLD-NEXT:    notb %cl
+; X86-NO-BMI2-NO-SHLD-NEXT:    addl %edx, %edx
+; X86-NO-BMI2-NO-SHLD-NEXT:    # kill: def $cl killed $cl killed $ecx
+; X86-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %edx
+; X86-NO-BMI2-NO-SHLD-NEXT:    orl %esi, %edx
+; X86-NO-BMI2-NO-SHLD-NEXT:    movb %dl, (%eax)
+; X86-NO-BMI2-NO-SHLD-NEXT:    addl $40, %esp
+; X86-NO-BMI2-NO-SHLD-NEXT:    popl %esi
+; X86-NO-BMI2-NO-SHLD-NEXT:    retl
+;
+; X86-SHLD-LABEL: load_1byte_chunk_of_16byte_alloca:
+; X86-SHLD:       # %bb.0:
+; X86-SHLD-NEXT:    pushl %ebx
+; X86-SHLD-NEXT:    subl $40, %esp
+; X86-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-SHLD-NEXT:    movups (%edx), %xmm0
+; X86-SHLD-NEXT:    shll $3, %ecx
+; X86-SHLD-NEXT:    xorps %xmm1, %xmm1
+; X86-SHLD-NEXT:    movaps %xmm1, {{[0-9]+}}(%esp)
+; X86-SHLD-NEXT:    movaps %xmm0, (%esp)
+; X86-SHLD-NEXT:    movl %ecx, %edx
+; X86-SHLD-NEXT:    shrb $3, %dl
+; X86-SHLD-NEXT:    andb $12, %dl
+; X86-SHLD-NEXT:    movzbl %dl, %edx
+; X86-SHLD-NEXT:    movl (%esp,%edx), %ebx
+; X86-SHLD-NEXT:    movl 4(%esp,%edx), %edx
+; X86-SHLD-NEXT:    # kill: def $cl killed $cl killed $ecx
+; X86-SHLD-NEXT:    shrdl %cl, %edx, %ebx
+; X86-SHLD-NEXT:    movb %bl, (%eax)
+; X86-SHLD-NEXT:    addl $40, %esp
+; X86-SHLD-NEXT:    popl %ebx
+; X86-SHLD-NEXT:    retl
+;
+; X86-HAVE-BMI2-NO-SHLD-LABEL: load_1byte_chunk_of_16byte_alloca:
+; X86-HAVE-BMI2-NO-SHLD:       # %bb.0:
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    pushl %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    subl $40, %esp
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movups (%edx), %xmm0
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shll $3, %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    xorps %xmm1, %xmm1
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movaps %xmm1, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movaps %xmm0, (%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrb $3, %dl
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    andb $12, %dl
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movzbl %dl, %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %ecx, (%esp,%edx), %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    notb %cl
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 4(%esp,%edx), %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    addl %edx, %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %ecx, %edx, %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl %esi, %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movb %cl, (%eax)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    addl $40, %esp
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    popl %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    retl
   %init = load <16 x i8>, ptr %src, align 1
   %byteOff.numbits = shl nuw nsw i64 %byteOff, 3
   %intermediate.val.frozen = freeze <16 x i8> %init
@@ -711,32 +765,86 @@ define void @load_2byte_chunk_of_16byte_alloca(ptr %src, i64 %byteOff, ptr %dst)
 ; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movw %si, (%rdx)
 ; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    retq
 ;
-; X86-LABEL: load_2byte_chunk_of_16byte_alloca:
-; X86:       # %bb.0:
-; X86-NEXT:    subl $32, %esp
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-NEXT:    movdqu (%edx), %xmm0
-; X86-NEXT:    shll $3, %ecx
-; X86-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
-; X86-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[2,3,2,3]
-; X86-NEXT:    pshufd {{.*#+}} xmm3 = xmm0[3,3,3,3]
-; X86-NEXT:    movd %xmm0, (%esp)
-; X86-NEXT:    movd %xmm3, {{[0-9]+}}(%esp)
-; X86-NEXT:    movd %xmm2, {{[0-9]+}}(%esp)
-; X86-NEXT:    movd %xmm1, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT:    shrb $3, %cl
-; X86-NEXT:    andb $15, %cl
-; X86-NEXT:    movzbl %cl, %ecx
-; X86-NEXT:    movl (%esp,%ecx), %ecx
-; X86-NEXT:    movw %cx, (%eax)
-; X86-NEXT:    addl $32, %esp
-; X86-NEXT:    retl
+; X86-NO-BMI2-NO-SHLD-LABEL: load_2byte_chunk_of_16byte_alloca:
+; X86-NO-BMI2-NO-SHLD:       # %bb.0:
+; X86-NO-BMI2-NO-SHLD-NEXT:    pushl %esi
+; X86-NO-BMI2-NO-SHLD-NEXT:    subl $40, %esp
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-NO-BMI2-NO-SHLD-NEXT:    movups (%edx), %xmm0
+; X86-NO-BMI2-NO-SHLD-NEXT:    shll $3, %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT:    xorps %xmm1, %xmm1
+; X86-NO-BMI2-NO-SHLD-NEXT:    movaps %xmm1, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-NO-SHLD-NEXT:    movaps %xmm0, (%esp)
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ecx, %edx
+; X86-NO-BMI2-NO-SHLD-NEXT:    shrb $3, %dl
+; X86-NO-BMI2-NO-SHLD-NEXT:    andb $12, %dl
+; X86-NO-BMI2-NO-SHLD-NEXT:    movzbl %dl, %edx
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl (%esp,%edx), %esi
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl 4(%esp,%edx), %edx
+; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %esi
+; X86-NO-BMI2-NO-SHLD-NEXT:    notb %cl
+; X86-NO-BMI2-NO-SHLD-NEXT:    addl %edx, %edx
+; X86-NO-BMI2-NO-SHLD-NEXT:    # kill: def $cl killed $cl killed $ecx
+; X86-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %edx
+; X86-NO-BMI2-NO-SHLD-NEXT:    orl %esi, %edx
+; X86-NO-BMI2-NO-SHLD-NEXT:    movw %dx, (%eax)
+; X86-NO-BMI2-NO-SHLD-NEXT:    addl $40, %esp
+; X86-NO-BMI2-NO-SHLD-NEXT:    popl %esi
+; X86-NO-BMI2-NO-SHLD-NEXT:    retl
+;
+; X86-SHLD-LABEL: load_2byte_chunk_of_16byte_alloca:
+; X86-SHLD:       # %bb.0:
+; X86-SHLD-NEXT:    pushl %esi
+; X86-SHLD-NEXT:    subl $40, %esp
+; X86-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-SHLD-NEXT:    movups (%edx), %xmm0
+; X86-SHLD-NEXT:    shll $3, %ecx
+; X86-SHLD-NEXT:    xorps %xmm1, %xmm1
+; X86-SHLD-NEXT:    movaps %xmm1, {{[0-9]+}}(%esp)
+; X86-SHLD-NEXT:    movaps %xmm0, (%esp)
+; X86-SHLD-NEXT:    movl %ecx, %edx
+; X86-SHLD-NEXT:    shrb $3, %dl
+; X86-SHLD-NEXT:    andb $12, %dl
+; X86-SHLD-NEXT:    movzbl %dl, %edx
+; X86-SHLD-NEXT:    movl (%esp,%edx), %esi
+; X86-SHLD-NEXT:    movl 4(%esp,%edx), %edx
+; X86-SHLD-NEXT:    # kill: def $cl killed $cl killed $ecx
+; X86-SHLD-NEXT:    shrdl %cl, %edx, %esi
+; X86-SHLD-NEXT:    movw %si, (%eax)
+; X86-SHLD-NEXT:    addl $40, %esp
+; X86-SHLD-NEXT:    popl %esi
+; X86-SHLD-NEXT:    retl
+;
+; X86-HAVE-BMI2-NO-SHLD-LABEL: load_2byte_chunk_of_16byte_alloca:
+; X86-HAVE-BMI2-NO-SHLD:       # %bb.0:
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    pushl %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    subl $40, %esp
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movups (%edx), %xmm0
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shll $3, %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    xorps %xmm1, %xmm1
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movaps %xmm1, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movaps %xmm0, (%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrb $3, %dl
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    andb $12, %dl
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movzbl %dl, %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %ecx, (%esp,%edx), %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    notb %cl
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 4(%esp,%edx), %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    addl %edx, %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %ecx, %edx, %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl %esi, %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movw %cx, (%eax)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    addl $40, %esp
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    popl %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    retl
   %init = load <16 x i8>, ptr %src, align 1
   %byteOff.numbits = shl nuw nsw i64 %byteOff, 3
   %intermediate.val.frozen = freeze <16 x i8> %init
@@ -818,32 +926,86 @@ define void @load_4byte_chunk_of_16byte_alloca(ptr %src, i64 %byteOff, ptr %dst)
 ; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %esi, (%rdx)
 ; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    retq
 ;
-; X86-LABEL: load_4byte_chunk_of_16byte_alloca:
-; X86:       # %bb.0:
-; X86-NEXT:    subl $32, %esp
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-NEXT:    movdqu (%edx), %xmm0
-; X86-NEXT:    shll $3, %ecx
-; X86-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
-; X86-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[2,3,2,3]
-; X86-NEXT:    pshufd {{.*#+}} xmm3 = xmm0[3,3,3,3]
-; X86-NEXT:    movd %xmm0, (%esp)
-; X86-NEXT:    movd %xmm3, {{[0-9]+}}(%esp)
-; X86-NEXT:    movd %xmm2, {{[0-9]+}}(%esp)
-; X86-NEXT:    movd %xmm1, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT:    shrb $3, %cl
-; X86-NEXT:    andb $15, %cl
-; X86-NEXT:    movzbl %cl, %ecx
-; X86-NEXT:    movl (%esp,%ecx), %ecx
-; X86-NEXT:    movl %ecx, (%eax)
-; X86-NEXT:    addl $32, %esp
-; X86-NEXT:    retl
+; X86-NO-BMI2-NO-SHLD-LABEL: load_4byte_chunk_of_16byte_alloca:
+; X86-NO-BMI2-NO-SHLD:       # %bb.0:
+; X86-NO-BMI2-NO-SHLD-NEXT:    pushl %esi
+; X86-NO-BMI2-NO-SHLD-NEXT:    subl $40, %esp
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-NO-BMI2-NO-SHLD-NEXT:    movups (%edx), %xmm0
+; X86-NO-BMI2-NO-SHLD-NEXT:    shll $3, %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT:    xorps %xmm1, %xmm1
+; X86-NO-BMI2-NO-SHLD-NEXT:    movaps %xmm1, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-NO-SHLD-NEXT:    movaps %xmm0, (%esp)
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ecx, %edx
+; X86-NO-BMI2-NO-SHLD-NEXT:    shrb $3, %dl
+; X86-NO-BMI2-NO-SHLD-NEXT:    andb $12, %dl
+; X86-NO-BMI2-NO-SHLD-NEXT:    movzbl %dl, %edx
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl (%esp,%edx), %esi
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl 4(%esp,%edx), %edx
+; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %esi
+; X86-NO-BMI2-NO-SHLD-NEXT:    notb %cl
+; X86-NO-BMI2-NO-SHLD-NEXT:    addl %edx, %edx
+; X86-NO-BMI2-NO-SHLD-NEXT:    # kill: def $cl killed $cl killed $ecx
+; X86-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %edx
+; X86-NO-BMI2-NO-SHLD-NEXT:    orl %esi, %edx
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %edx, (%eax)
+; X86-NO-BMI2-NO-SHLD-NEXT:    addl $40, %esp
+; X86-NO-BMI2-NO-SHLD-NEXT:    popl %esi
+; X86-NO-BMI2-NO-SHLD-NEXT:    retl
+;
+; X86-SHLD-LABEL: load_4byte_chunk_of_16byte_alloca:
+; X86-SHLD:       # %bb.0:
+; X86-SHLD-NEXT:    pushl %esi
+; X86-SHLD-NEXT:    subl $40, %esp
+; X86-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-SHLD-NEXT:    movups (%edx), %xmm0
+; X86-SHLD-NEXT:    shll $3, %ecx
+; X86-SHLD-NEXT:    xorps %xmm1, %xmm1
+; X86-SHLD-NEXT:    movaps %xmm1, {{[0-9]+}}(%esp)
+; X86-SHLD-NEXT:    movaps %xmm0, (%esp)
+; X86-SHLD-NEXT:    movl %ecx, %edx
+; X86-SHLD-NEXT:    shrb $3, %dl
+; X86-SHLD-NEXT:    andb $12, %dl
+; X86-SHLD-NEXT:    movzbl %dl, %edx
+; X86-SHLD-NEXT:    movl (%esp,%edx), %esi
+; X86-SHLD-NEXT:    movl 4(%esp,%edx), %edx
+; X86-SHLD-NEXT:    # kill: def $cl killed $cl killed $ecx
+; X86-SHLD-NEXT:    shrdl %cl, %edx, %esi
+; X86-SHLD-NEXT:    movl %esi, (%eax)
+; X86-SHLD-NEXT:    addl $40, %esp
+; X86-SHLD-NEXT:    popl %esi
+; X86-SHLD-NEXT:    retl
+;
+; X86-HAVE-BMI2-NO-SHLD-LABEL: load_4byte_chunk_of_16byte_alloca:
+; X86-HAVE-BMI2-NO-SHLD:       # %bb.0:
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    pushl %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    subl $40, %esp
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movups (%edx), %xmm0
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shll $3, %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    xorps %xmm1, %xmm1
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movaps %xmm1, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movaps %xmm0, (%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrb $3, %dl
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    andb $12, %dl
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movzbl %dl, %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %ecx, (%esp,%edx), %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    notb %cl
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 4(%esp,%edx), %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    addl %edx, %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %ecx, %edx, %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl %esi, %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, (%eax)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    addl $40, %esp
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    popl %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    retl
   %init = load <16 x i8>, ptr %src, align 1
   %byteOff.numbits = shl nuw nsw i64 %byteOff, 3
   %intermediate.val.frozen = freeze <16 x i8> %init
@@ -925,34 +1087,125 @@ define void @load_8byte_chunk_of_16byte_alloca(ptr %src, i64 %byteOff, ptr %dst)
 ; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq %rsi, (%rdx)
 ; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    retq
 ;
-; X86-LABEL: load_8byte_chunk_of_16byte_alloca:
-; X86:       # %bb.0:
-; X86-NEXT:    subl $32, %esp
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-NEXT:    movdqu (%edx), %xmm0
-; X86-NEXT:    shll $3, %ecx
-; X86-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
-; X86-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[2,3,2,3]
-; X86-NEXT:    pshufd {{.*#+}} xmm3 = xmm0[3,3,3,3]
-; X86-NEXT:    movd %xmm0, (%esp)
-; X86-NEXT:    movd %xmm3, {{[0-9]+}}(%esp)
-; X86-NEXT:    movd %xmm2, {{[0-9]+}}(%esp)
-; X86-NEXT:    movd %xmm1, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT:    shrb $3, %cl
-; X86-NEXT:    andb $15, %cl
-; X86-NEXT:    movzbl %cl, %ecx
-; X86-NEXT:    movl (%esp,%ecx), %edx
-; X86-NEXT:    movl 4(%esp,%ecx), %ecx
-; X86-NEXT:    movl %ecx, 4(%eax)
-; X86-NEXT:    movl %edx, (%eax)
-; X86-NEXT:    addl $32, %esp
-; X86-NEXT:    retl
+; X86-NO-BMI2-NO-SHLD-LABEL: load_8byte_chunk_of_16byte_alloca:
+; X86-NO-BMI2-NO-SHLD:       # %bb.0:
+; X86-NO-BMI2-NO-SHLD-NEXT:    pushl %ebp
+; X86-NO-BMI2-NO-SHLD-NEXT:    pushl %ebx
+; X86-NO-BMI2-NO-SHLD-NEXT:    pushl %edi
+; X86-NO-BMI2-NO-SHLD-NEXT:    pushl %esi
+; X86-NO-BMI2-NO-SHLD-NEXT:    subl $44, %esp
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT:    movups (%ecx), %xmm0
+; X86-NO-BMI2-NO-SHLD-NEXT:    shll $3, %eax
+; X86-NO-BMI2-NO-SHLD-NEXT:    xorps %xmm1, %xmm1
+; X86-NO-BMI2-NO-SHLD-NEXT:    movaps %xmm1, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-NO-SHLD-NEXT:    movaps %xmm0, (%esp)
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT:    shrb $3, %cl
+; X86-NO-BMI2-NO-SHLD-NEXT:    andb $12, %cl
+; X86-NO-BMI2-NO-SHLD-NEXT:    movzbl %cl, %ebx
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl (%esp,%ebx), %ebp
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl 4(%esp,%ebx), %esi
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %ebp
+; X86-NO-BMI2-NO-SHLD-NEXT:    notb %cl
+; X86-NO-BMI2-NO-SHLD-NEXT:    leal (%esi,%esi), %edi
+; X86-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %edi
+; X86-NO-BMI2-NO-SHLD-NEXT:    orl %ebp, %edi
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %esi
+; X86-NO-BMI2-NO-SHLD-NEXT:    andb $24, %al
+; X86-NO-BMI2-NO-SHLD-NEXT:    notb %al
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl 8(%esp,%ebx), %ebx
+; X86-NO-BMI2-NO-SHLD-NEXT:    addl %ebx, %ebx
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %ebx
+; X86-NO-BMI2-NO-SHLD-NEXT:    orl %esi, %ebx
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ebx, 4(%edx)
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %edi, (%edx)
+; X86-NO-BMI2-NO-SHLD-NEXT:    addl $44, %esp
+; X86-NO-BMI2-NO-SHLD-NEXT:    popl %esi
+; X86-NO-BMI2-NO-SHLD-NEXT:    popl %edi
+; X86-NO-BMI2-NO-SHLD-NEXT:    popl %ebx
+; X86-NO-BMI2-NO-SHLD-NEXT:    popl %ebp
+; X86-NO-BMI2-NO-SHLD-NEXT:    retl
+;
+; X86-SHLD-LABEL: load_8byte_chunk_of_16byte_alloca:
+; X86-SHLD:       # %bb.0:
+; X86-SHLD-NEXT:    pushl %ebx
+; X86-SHLD-NEXT:    pushl %edi
+; X86-SHLD-NEXT:    pushl %esi
+; X86-SHLD-NEXT:    subl $32, %esp
+; X86-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-SHLD-NEXT:    movups (%edx), %xmm0
+; X86-SHLD-NEXT:    shll $3, %ecx
+; X86-SHLD-NEXT:    xorps %xmm1, %xmm1
+; X86-SHLD-NEXT:    movaps %xmm1, {{[0-9]+}}(%esp)
+; X86-SHLD-NEXT:    movaps %xmm0, (%esp)
+; X86-SHLD-NEXT:    movl %ecx, %edx
+; X86-SHLD-NEXT:    shrb $3, %dl
+; X86-SHLD-NEXT:    andb $12, %dl
+; X86-SHLD-NEXT:    movzbl %dl, %edx
+; X86-SHLD-NEXT:    movl 8(%esp,%edx), %esi
+; X86-SHLD-NEXT:    movl (%esp,%edx), %edi
+; X86-SHLD-NEXT:    movl 4(%esp,%edx), %edx
+; X86-SHLD-NEXT:    movl %edx, %ebx
+; X86-SHLD-NEXT:    shrdl %cl, %esi, %ebx
+; X86-SHLD-NEXT:    # kill: def $cl killed $cl killed $ecx
+; X86-SHLD-NEXT:    shrdl %cl, %edx, %edi
+; X86-SHLD-NEXT:    movl %ebx, 4(%eax)
+; X86-SHLD-NEXT:    movl %edi, (%eax)
+; X86-SHLD-NEXT:    addl $32, %esp
+; X86-SHLD-NEXT:    popl %esi
+; X86-SHLD-NEXT:    popl %edi
+; X86-SHLD-NEXT:    popl %ebx
+; X86-SHLD-NEXT:    retl
+;
+; X86-HAVE-BMI2-NO-SHLD-LABEL: load_8byte_chunk_of_16byte_alloca:
+; X86-HAVE-BMI2-NO-SHLD:       # %bb.0:
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    pushl %ebp
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    pushl %ebx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    pushl %edi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    pushl %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    subl $44, %esp
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movups (%edx), %xmm0
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shll $3, %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    xorps %xmm1, %xmm1
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movaps %xmm1, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movaps %xmm0, (%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrb $3, %dl
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    andb $12, %dl
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movzbl %dl, %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %ecx, (%esp,%edx), %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, %ebx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    notb %bl
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 4(%esp,%edx), %edi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 8(%esp,%edx), %ebp
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    leal (%edi,%edi), %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %ebx, %edx, %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl %esi, %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %ecx, %edi, %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    andb $24, %cl
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    notb %cl
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    addl %ebp, %ebp
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %ecx, %ebp, %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl %esi, %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, 4(%eax)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %edx, (%eax)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    addl $44, %esp
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    popl %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    popl %edi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    popl %ebx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    popl %ebp
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    retl
   %init = load <16 x i8>, ptr %src, align 1
   %byteOff.numbits = shl nuw nsw i64 %byteOff, 3
   %intermediate.val.frozen = freeze <16 x i8> %init
@@ -967,64 +1220,128 @@ define void @load_8byte_chunk_of_16byte_alloca(ptr %src, i64 %byteOff, ptr %dst)
 ; no @load_16byte_chunk_of_16byte_alloca
 
 define void @load_1byte_chunk_of_32byte_alloca(ptr %src, i64 %byteOff, ptr %dst) nounwind {
-; X64-LABEL: load_1byte_chunk_of_32byte_alloca:
-; X64:       # %bb.0:
-; X64-NEXT:    movdqu (%rdi), %xmm0
-; X64-NEXT:    movdqu 16(%rdi), %xmm1
-; X64-NEXT:    shll $3, %esi
-; X64-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[2,3,2,3]
-; X64-NEXT:    pshufd {{.*#+}} xmm3 = xmm1[2,3,2,3]
-; X64-NEXT:    movq %xmm1, -{{[0-9]+}}(%rsp)
-; X64-NEXT:    movq %xmm0, -{{[0-9]+}}(%rsp)
-; X64-NEXT:    movq %xmm3, -{{[0-9]+}}(%rsp)
-; X64-NEXT:    movq %xmm2, -{{[0-9]+}}(%rsp)
-; X64-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
-; X64-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
-; X64-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
-; X64-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
-; X64-NEXT:    shrb $3, %sil
-; X64-NEXT:    movzbl %sil, %eax
-; X64-NEXT:    movzbl -64(%rsp,%rax), %eax
-; X64-NEXT:    movb %al, (%rdx)
-; X64-NEXT:    retq
-;
-; X86-LABEL: load_1byte_chunk_of_32byte_alloca:
-; X86:       # %bb.0:
-; X86-NEXT:    subl $64, %esp
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-NEXT:    movdqu (%edx), %xmm0
-; X86-NEXT:    movdqu 16(%edx), %xmm1
-; X86-NEXT:    shll $3, %ecx
-; X86-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[1,1,1,1]
-; X86-NEXT:    pshufd {{.*#+}} xmm3 = xmm0[2,3,2,3]
-; X86-NEXT:    pshufd {{.*#+}} xmm4 = xmm0[3,3,3,3]
-; X86-NEXT:    pshufd {{.*#+}} xmm5 = xmm1[1,1,1,1]
-; X86-NEXT:    pshufd {{.*#+}} xmm6 = xmm1[2,3,2,3]
-; X86-NEXT:    pshufd {{.*#+}} xmm7 = xmm1[3,3,3,3]
-; X86-NEXT:    movd %xmm1, {{[0-9]+}}(%esp)
-; X86-NEXT:    movd %xmm0, (%esp)
-; X86-NEXT:    movd %xmm7, {{[0-9]+}}(%esp)
-; X86-NEXT:    movd %xmm6, {{[0-9]+}}(%esp)
-; X86-NEXT:    movd %xmm5, {{[0-9]+}}(%esp)
-; X86-NEXT:    movd %xmm4, {{[0-9]+}}(%esp)
-; X86-NEXT:    movd %xmm3, {{[0-9]+}}(%esp)
-; X86-NEXT:    movd %xmm2, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT:    shrb $3, %cl
-; X86-NEXT:    movzbl %cl, %ecx
-; X86-NEXT:    movzbl (%esp,%ecx), %ecx
-; X86-NEXT:    movb %cl, (%eax)
-; X86-NEXT:    addl $64, %esp
-; X86-NEXT:    retl
+; X64-NO-BMI2-LABEL: load_1byte_chunk_of_32byte_alloca:
+; X64-NO-BMI2:       # %bb.0:
+; X64-NO-BMI2-NEXT:    movups (%rdi), %xmm0
+; X64-NO-BMI2-NEXT:    movups 16(%rdi), %xmm1
+; X64-NO-BMI2-NEXT:    leal (,%rsi,8), %ecx
+; X64-NO-BMI2-NEXT:    xorps %xmm2, %xmm2
+; X64-NO-BMI2-NEXT:    movaps %xmm2, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-NEXT:    movaps %xmm2, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-NEXT:    movaps %xmm1, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-NEXT:    movl %ecx, %eax
+; X64-NO-BMI2-NEXT:    shrb $6, %al
+; X64-NO-BMI2-NEXT:    movzbl %al, %eax
+; X64-NO-BMI2-NEXT:    movq -72(%rsp,%rax,8), %rax
+; X64-NO-BMI2-NEXT:    # kill: def $cl killed $cl killed $ecx
+; X64-NO-BMI2-NEXT:    shrq %cl, %rax
+; X64-NO-BMI2-NEXT:    movb %al, (%rdx)
+; X64-NO-BMI2-NEXT:    retq
+;
+; X64-BMI2-LABEL: load_1byte_chunk_of_32byte_alloca:
+; X64-BMI2:       # %bb.0:
+; X64-BMI2-NEXT:    movups (%rdi), %xmm0
+; X64-BMI2-NEXT:    movups 16(%rdi), %xmm1
+; X64-BMI2-NEXT:    shll $3, %esi
+; X64-BMI2-NEXT:    xorps %xmm2, %xmm2
+; X64-BMI2-NEXT:    movaps %xmm2, -{{[0-9]+}}(%rsp)
+; X64-BMI2-NEXT:    movaps %xmm2, -{{[0-9]+}}(%rsp)
+; X64-BMI2-NEXT:    movaps %xmm1, -{{[0-9]+}}(%rsp)
+; X64-BMI2-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
+; X64-BMI2-NEXT:    movl %esi, %eax
+; X64-BMI2-NEXT:    shrb $6, %al
+; X64-BMI2-NEXT:    movzbl %al, %eax
+; X64-BMI2-NEXT:    shrxq %rsi, -72(%rsp,%rax,8), %rax
+; X64-BMI2-NEXT:    movb %al, (%rdx)
+; X64-BMI2-NEXT:    retq
+;
+; X86-NO-BMI2-NO-SHLD-LABEL: load_1byte_chunk_of_32byte_alloca:
+; X86-NO-BMI2-NO-SHLD:       # %bb.0:
+; X86-NO-BMI2-NO-SHLD-NEXT:    pushl %esi
+; X86-NO-BMI2-NO-SHLD-NEXT:    subl $72, %esp
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-NO-BMI2-NO-SHLD-NEXT:    movups (%edx), %xmm0
+; X86-NO-BMI2-NO-SHLD-NEXT:    movups 16(%edx), %xmm1
+; X86-NO-BMI2-NO-SHLD-NEXT:    shll $3, %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT:    xorps %xmm2, %xmm2
+; X86-NO-BMI2-NO-SHLD-NEXT:    movaps %xmm2, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-NO-SHLD-NEXT:    movaps %xmm2, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-NO-SHLD-NEXT:    movaps %xmm1, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-NO-SHLD-NEXT:    movaps %xmm0, (%esp)
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ecx, %edx
+; X86-NO-BMI2-NO-SHLD-NEXT:    shrb $5, %dl
+; X86-NO-BMI2-NO-SHLD-NEXT:    movzbl %dl, %edx
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl (%esp,%edx,4), %esi
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl 4(%esp,%edx,4), %edx
+; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %esi
+; X86-NO-BMI2-NO-SHLD-NEXT:    notb %cl
+; X86-NO-BMI2-NO-SHLD-NEXT:    addl %edx, %edx
+; X86-NO-BMI2-NO-SHLD-NEXT:    # kill: def $cl killed $cl killed $ecx
+; X86-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %edx
+; X86-NO-BMI2-NO-SHLD-NEXT:    orl %esi, %edx
+; X86-NO-BMI2-NO-SHLD-NEXT:    movb %dl, (%eax)
+; X86-NO-BMI2-NO-SHLD-NEXT:    addl $72, %esp
+; X86-NO-BMI2-NO-SHLD-NEXT:    popl %esi
+; X86-NO-BMI2-NO-SHLD-NEXT:    retl
+;
+; X86-SHLD-LABEL: load_1byte_chunk_of_32byte_alloca:
+; X86-SHLD:       # %bb.0:
+; X86-SHLD-NEXT:    pushl %ebx
+; X86-SHLD-NEXT:    subl $72, %esp
+; X86-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-SHLD-NEXT:    movups (%edx), %xmm0
+; X86-SHLD-NEXT:    movups 16(%edx), %xmm1
+; X86-SHLD-NEXT:    shll $3, %ecx
+; X86-SHLD-NEXT:    xorps %xmm2, %xmm2
+; X86-SHLD-NEXT:    movaps %xmm2, {{[0-9]+}}(%esp)
+; X86-SHLD-NEXT:    movaps %xmm2, {{[0-9]+}}(%esp)
+; X86-SHLD-NEXT:    movaps %xmm1, {{[0-9]+}}(%esp)
+; X86-SHLD-NEXT:    movaps %xmm0, (%esp)
+; X86-SHLD-NEXT:    movl %ecx, %edx
+; X86-SHLD-NEXT:    shrb $5, %dl
+; X86-SHLD-NEXT:    movzbl %dl, %edx
+; X86-SHLD-NEXT:    movl (%esp,%edx,4), %ebx
+; X86-SHLD-NEXT:    movl 4(%esp,%edx,4), %edx
+; X86-SHLD-NEXT:    # kill: def $cl killed $cl killed $ecx
+; X86-SHLD-NEXT:    shrdl %cl, %edx, %ebx
+; X86-SHLD-NEXT:    movb %bl, (%eax)
+; X86-SHLD-NEXT:    addl $72, %esp
+; X86-SHLD-NEXT:    popl %ebx
+; X86-SHLD-NEXT:    retl
+;
+; X86-HAVE-BMI2-NO-SHLD-LABEL: load_1byte_chunk_of_32byte_alloca:
+; X86-HAVE-BMI2-NO-SHLD:       # %bb.0:
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    pushl %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    subl $72, %esp
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movups (%edx), %xmm0
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movups 16(%edx), %xmm1
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shll $3, %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    xorps %xmm2, %xmm2
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movaps %xmm2, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movaps %xmm2, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movaps %xmm1, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movaps %xmm0, (%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrb $5, %dl
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movzbl %dl, %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %ecx, (%esp,%edx,4), %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    notb %cl
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 4(%esp,%edx,4), %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    addl %edx, %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %ecx, %edx, %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl %esi, %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movb %cl, (%eax)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    addl $72, %esp
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    popl %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    retl
   %init = load <32 x i8>, ptr %src, align 1
   %byteOff.numbits = shl nuw nsw i64 %byteOff, 3
   %intermediate.val.frozen = freeze <32 x i8> %init
@@ -1038,64 +1355,141 @@ define void @load_1byte_chunk_of_32byte_alloca(ptr %src, i64 %byteOff, ptr %dst)
 }
 
 define void @load_2byte_chunk_of_32byte_alloca(ptr %src, i64 %byteOff, ptr %dst) nounwind {
-; X64-LABEL: load_2byte_chunk_of_32byte_alloca:
-; X64:       # %bb.0:
-; X64-NEXT:    movdqu (%rdi), %xmm0
-; X64-NEXT:    movdqu 16(%rdi), %xmm1
-; X64-NEXT:    shll $3, %esi
-; X64-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[2,3,2,3]
-; X64-NEXT:    pshufd {{.*#+}} xmm3 = xmm1[2,3,2,3]
-; X64-NEXT:    movq %xmm1, -{{[0-9]+}}(%rsp)
-; X64-NEXT:    movq %xmm0, -{{[0-9]+}}(%rsp)
-; X64-NEXT:    movq %xmm3, -{{[0-9]+}}(%rsp)
-; X64-NEXT:    movq %xmm2, -{{[0-9]+}}(%rsp)
-; X64-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
-; X64-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
-; X64-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
-; X64-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
-; X64-NEXT:    shrb $3, %sil
-; X64-NEXT:    movzbl %sil, %eax
-; X64-NEXT:    movq -64(%rsp,%rax), %rax
-; X64-NEXT:    movw %ax, (%rdx)
-; X64-NEXT:    retq
-;
-; X86-LABEL: load_2byte_chunk_of_32byte_alloca:
-; X86:       # %bb.0:
-; X86-NEXT:    subl $64, %esp
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-NEXT:    movdqu (%edx), %xmm0
-; X86-NEXT:    movdqu 16(%edx), %xmm1
-; X86-NEXT:    shll $3, %ecx
-; X86-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[1,1,1,1]
-; X86-NEXT:    pshufd {{.*#+}} xmm3 = xmm0[2,3,2,3]
-; X86-NEXT:    pshufd {{.*#+}} xmm4 = xmm0[3,3,3,3]
-; X86-NEXT:    pshufd {{.*#+}} xmm5 = xmm1[1,1,1,1]
-; X86-NEXT:    pshufd {{.*#+}} xmm6 = xmm1[2,3,2,3]
-; X86-NEXT:    pshufd {{.*#+}} xmm7 = xmm1[3,3,3,3]
-; X86-NEXT:    movd %xmm1, {{[0-9]+}}(%esp)
-; X86-NEXT:    movd %xmm0, (%esp)
-; X86-NEXT:    movd %xmm7, {{[0-9]+}}(%esp)
-; X86-NEXT:    movd %xmm6, {{[0-9]+}}(%esp)
-; X86-NEXT:    movd %xmm5, {{[0-9]+}}(%esp)
-; X86-NEXT:    movd %xmm4, {{[0-9]+}}(%esp)
-; X86-NEXT:    movd %xmm3, {{[0-9]+}}(%esp)
-; X86-NEXT:    movd %xmm2, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT:    shrb $3, %cl
-; X86-NEXT:    movzbl %cl, %ecx
-; X86-NEXT:    movl (%esp,%ecx), %ecx
-; X86-NEXT:    movw %cx, (%eax)
-; X86-NEXT:    addl $64, %esp
-; X86-NEXT:    retl
+; X64-NO-BMI2-LABEL: load_2byte_chunk_of_32byte_alloca:
+; X64-NO-BMI2:       # %bb.0:
+; X64-NO-BMI2-NEXT:    movups (%rdi), %xmm0
+; X64-NO-BMI2-NEXT:    movups 16(%rdi), %xmm1
+; X64-NO-BMI2-NEXT:    leal (,%rsi,8), %ecx
+; X64-NO-BMI2-NEXT:    xorps %xmm2, %xmm2
+; X64-NO-BMI2-NEXT:    movaps %xmm2, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-NEXT:    movaps %xmm2, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-NEXT:    movaps %xmm1, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-NEXT:    movl %ecx, %eax
+; X64-NO-BMI2-NEXT:    shrb $6, %al
+; X64-NO-BMI2-NEXT:    movzbl %al, %eax
+; X64-NO-BMI2-NEXT:    movq -72(%rsp,%rax,8), %rsi
+; X64-NO-BMI2-NEXT:    shrq %cl, %rsi
+; X64-NO-BMI2-NEXT:    movl -64(%rsp,%rax,8), %eax
+; X64-NO-BMI2-NEXT:    addl %eax, %eax
+; X64-NO-BMI2-NEXT:    andb $56, %cl
+; X64-NO-BMI2-NEXT:    notb %cl
+; X64-NO-BMI2-NEXT:    # kill: def $cl killed $cl killed $ecx
+; X64-NO-BMI2-NEXT:    shlq %cl, %rax
+; X64-NO-BMI2-NEXT:    orl %esi, %eax
+; X64-NO-BMI2-NEXT:    movw %ax, (%rdx)
+; X64-NO-BMI2-NEXT:    retq
+;
+; X64-BMI2-LABEL: load_2byte_chunk_of_32byte_alloca:
+; X64-BMI2:       # %bb.0:
+; X64-BMI2-NEXT:    movups (%rdi), %xmm0
+; X64-BMI2-NEXT:    movups 16(%rdi), %xmm1
+; X64-BMI2-NEXT:    shll $3, %esi
+; X64-BMI2-NEXT:    xorps %xmm2, %xmm2
+; X64-BMI2-NEXT:    movaps %xmm2, -{{[0-9]+}}(%rsp)
+; X64-BMI2-NEXT:    movaps %xmm2, -{{[0-9]+}}(%rsp)
+; X64-BMI2-NEXT:    movaps %xmm1, -{{[0-9]+}}(%rsp)
+; X64-BMI2-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
+; X64-BMI2-NEXT:    movl %esi, %eax
+; X64-BMI2-NEXT:    shrb $6, %al
+; X64-BMI2-NEXT:    movzbl %al, %eax
+; X64-BMI2-NEXT:    shrxq %rsi, -72(%rsp,%rax,8), %rcx
+; X64-BMI2-NEXT:    # kill: def $sil killed $sil killed $rsi def $rsi
+; X64-BMI2-NEXT:    andb $56, %sil
+; X64-BMI2-NEXT:    notb %sil
+; X64-BMI2-NEXT:    movl -64(%rsp,%rax,8), %eax
+; X64-BMI2-NEXT:    addl %eax, %eax
+; X64-BMI2-NEXT:    shlxq %rsi, %rax, %rax
+; X64-BMI2-NEXT:    orl %eax, %ecx
+; X64-BMI2-NEXT:    movw %cx, (%rdx)
+; X64-BMI2-NEXT:    retq
+;
+; X86-NO-BMI2-NO-SHLD-LABEL: load_2byte_chunk_of_32byte_alloca:
+; X86-NO-BMI2-NO-SHLD:       # %bb.0:
+; X86-NO-BMI2-NO-SHLD-NEXT:    pushl %esi
+; X86-NO-BMI2-NO-SHLD-NEXT:    subl $72, %esp
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-NO-BMI2-NO-SHLD-NEXT:    movups (%edx), %xmm0
+; X86-NO-BMI2-NO-SHLD-NEXT:    movups 16(%edx), %xmm1
+; X86-NO-BMI2-NO-SHLD-NEXT:    shll $3, %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT:    xorps %xmm2, %xmm2
+; X86-NO-BMI2-NO-SHLD-NEXT:    movaps %xmm2, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-NO-SHLD-NEXT:    movaps %xmm2, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-NO-SHLD-NEXT:    movaps %xmm1, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-NO-SHLD-NEXT:    movaps %xmm0, (%esp)
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ecx, %edx
+; X86-NO-BMI2-NO-SHLD-NEXT:    shrb $5, %dl
+; X86-NO-BMI2-NO-SHLD-NEXT:    movzbl %dl, %edx
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl (%esp,%edx,4), %esi
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl 4(%esp,%edx,4), %edx
+; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %esi
+; X86-NO-BMI2-NO-SHLD-NEXT:    notb %cl
+; X86-NO-BMI2-NO-SHLD-NEXT:    addl %edx, %edx
+; X86-NO-BMI2-NO-SHLD-NEXT:    # kill: def $cl killed $cl killed $ecx
+; X86-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %edx
+; X86-NO-BMI2-NO-SHLD-NEXT:    orl %esi, %edx
+; X86-NO-BMI2-NO-SHLD-NEXT:    movw %dx, (%eax)
+; X86-NO-BMI2-NO-SHLD-NEXT:    addl $72, %esp
+; X86-NO-BMI2-NO-SHLD-NEXT:    popl %esi
+; X86-NO-BMI2-NO-SHLD-NEXT:    retl
+;
+; X86-SHLD-LABEL: load_2byte_chunk_of_32byte_alloca:
+; X86-SHLD:       # %bb.0:
+; X86-SHLD-NEXT:    pushl %esi
+; X86-SHLD-NEXT:    subl $72, %esp
+; X86-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-SHLD-NEXT:    movups (%edx), %xmm0
+; X86-SHLD-NEXT:    movups 16(%edx), %xmm1
+; X86-SHLD-NEXT:    shll $3, %ecx
+; X86-SHLD-NEXT:    xorps %xmm2, %xmm2
+; X86-SHLD-NEXT:    movaps %xmm2, {{[0-9]+}}(%esp)
+; X86-SHLD-NEXT:    movaps %xmm2, {{[0-9]+}}(%esp)
+; X86-SHLD-NEXT:    movaps %xmm1, {{[0-9]+}}(%esp)
+; X86-SHLD-NEXT:    movaps %xmm0, (%esp)
+; X86-SHLD-NEXT:    movl %ecx, %edx
+; X86-SHLD-NEXT:    shrb $5, %dl
+; X86-SHLD-NEXT:    movzbl %dl, %edx
+; X86-SHLD-NEXT:    movl (%esp,%edx,4), %esi
+; X86-SHLD-NEXT:    movl 4(%esp,%edx,4), %edx
+; X86-SHLD-NEXT:    # kill: def $cl killed $cl killed $ecx
+; X86-SHLD-NEXT:    shrdl %cl, %edx, %esi
+; X86-SHLD-NEXT:    movw %si, (%eax)
+; X86-SHLD-NEXT:    addl $72, %esp
+; X86-SHLD-NEXT:    popl %esi
+; X86-SHLD-NEXT:    retl
+;
+; X86-HAVE-BMI2-NO-SHLD-LABEL: load_2byte_chunk_of_32byte_alloca:
+; X86-HAVE-BMI2-NO-SHLD:       # %bb.0:
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    pushl %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    subl $72, %esp
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movups (%edx), %xmm0
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movups 16(%edx), %xmm1
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shll $3, %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    xorps %xmm2, %xmm2
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movaps %xmm2, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movaps %xmm2, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movaps %xmm1, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movaps %xmm0, (%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrb $5, %dl
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movzbl %dl, %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %ecx, (%esp,%edx,4), %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    notb %cl
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 4(%esp,%edx,4), %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    addl %edx, %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %ecx, %edx, %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl %esi, %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movw %cx, (%eax)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    addl $72, %esp
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    popl %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    retl
   %init = load <32 x i8>, ptr %src, align 1
   %byteOff.numbits = shl nuw nsw i64 %byteOff, 3
   %intermediate.val.frozen = freeze <32 x i8> %init
@@ -1108,64 +1502,141 @@ define void @load_2byte_chunk_of_32byte_alloca(ptr %src, i64 %byteOff, ptr %dst)
 }
 
 define void @load_4byte_chunk_of_32byte_alloca(ptr %src, i64 %byteOff, ptr %dst) nounwind {
-; X64-LABEL: load_4byte_chunk_of_32byte_alloca:
-; X64:       # %bb.0:
-; X64-NEXT:    movdqu (%rdi), %xmm0
-; X64-NEXT:    movdqu 16(%rdi), %xmm1
-; X64-NEXT:    shll $3, %esi
-; X64-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[2,3,2,3]
-; X64-NEXT:    pshufd {{.*#+}} xmm3 = xmm1[2,3,2,3]
-; X64-NEXT:    movq %xmm1, -{{[0-9]+}}(%rsp)
-; X64-NEXT:    movq %xmm0, -{{[0-9]+}}(%rsp)
-; X64-NEXT:    movq %xmm3, -{{[0-9]+}}(%rsp)
-; X64-NEXT:    movq %xmm2, -{{[0-9]+}}(%rsp)
-; X64-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
-; X64-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
-; X64-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
-; X64-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
-; X64-NEXT:    shrb $3, %sil
-; X64-NEXT:    movzbl %sil, %eax
-; X64-NEXT:    movl -64(%rsp,%rax), %eax
-; X64-NEXT:    movl %eax, (%rdx)
-; X64-NEXT:    retq
-;
-; X86-LABEL: load_4byte_chunk_of_32byte_alloca:
-; X86:       # %bb.0:
-; X86-NEXT:    subl $64, %esp
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-NEXT:    movdqu (%edx), %xmm0
-; X86-NEXT:    movdqu 16(%edx), %xmm1
-; X86-NEXT:    shll $3, %ecx
-; X86-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[1,1,1,1]
-; X86-NEXT:    pshufd {{.*#+}} xmm3 = xmm0[2,3,2,3]
-; X86-NEXT:    pshufd {{.*#+}} xmm4 = xmm0[3,3,3,3]
-; X86-NEXT:    pshufd {{.*#+}} xmm5 = xmm1[1,1,1,1]
-; X86-NEXT:    pshufd {{.*#+}} xmm6 = xmm1[2,3,2,3]
-; X86-NEXT:    pshufd {{.*#+}} xmm7 = xmm1[3,3,3,3]
-; X86-NEXT:    movd %xmm1, {{[0-9]+}}(%esp)
-; X86-NEXT:    movd %xmm0, (%esp)
-; X86-NEXT:    movd %xmm7, {{[0-9]+}}(%esp)
-; X86-NEXT:    movd %xmm6, {{[0-9]+}}(%esp)
-; X86-NEXT:    movd %xmm5, {{[0-9]+}}(%esp)
-; X86-NEXT:    movd %xmm4, {{[0-9]+}}(%esp)
-; X86-NEXT:    movd %xmm3, {{[0-9]+}}(%esp)
-; X86-NEXT:    movd %xmm2, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT:    shrb $3, %cl
-; X86-NEXT:    movzbl %cl, %ecx
-; X86-NEXT:    movl (%esp,%ecx), %ecx
-; X86-NEXT:    movl %ecx, (%eax)
-; X86-NEXT:    addl $64, %esp
-; X86-NEXT:    retl
+; X64-NO-BMI2-LABEL: load_4byte_chunk_of_32byte_alloca:
+; X64-NO-BMI2:       # %bb.0:
+; X64-NO-BMI2-NEXT:    movups (%rdi), %xmm0
+; X64-NO-BMI2-NEXT:    movups 16(%rdi), %xmm1
+; X64-NO-BMI2-NEXT:    leal (,%rsi,8), %ecx
+; X64-NO-BMI2-NEXT:    xorps %xmm2, %xmm2
+; X64-NO-BMI2-NEXT:    movaps %xmm2, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-NEXT:    movaps %xmm2, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-NEXT:    movaps %xmm1, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-NEXT:    movl %ecx, %eax
+; X64-NO-BMI2-NEXT:    shrb $6, %al
+; X64-NO-BMI2-NEXT:    movzbl %al, %eax
+; X64-NO-BMI2-NEXT:    movq -72(%rsp,%rax,8), %rsi
+; X64-NO-BMI2-NEXT:    shrq %cl, %rsi
+; X64-NO-BMI2-NEXT:    movl -64(%rsp,%rax,8), %eax
+; X64-NO-BMI2-NEXT:    addl %eax, %eax
+; X64-NO-BMI2-NEXT:    andb $56, %cl
+; X64-NO-BMI2-NEXT:    notb %cl
+; X64-NO-BMI2-NEXT:    # kill: def $cl killed $cl killed $ecx
+; X64-NO-BMI2-NEXT:    shlq %cl, %rax
+; X64-NO-BMI2-NEXT:    orl %esi, %eax
+; X64-NO-BMI2-NEXT:    movl %eax, (%rdx)
+; X64-NO-BMI2-NEXT:    retq
+;
+; X64-BMI2-LABEL: load_4byte_chunk_of_32byte_alloca:
+; X64-BMI2:       # %bb.0:
+; X64-BMI2-NEXT:    movups (%rdi), %xmm0
+; X64-BMI2-NEXT:    movups 16(%rdi), %xmm1
+; X64-BMI2-NEXT:    shll $3, %esi
+; X64-BMI2-NEXT:    xorps %xmm2, %xmm2
+; X64-BMI2-NEXT:    movaps %xmm2, -{{[0-9]+}}(%rsp)
+; X64-BMI2-NEXT:    movaps %xmm2, -{{[0-9]+}}(%rsp)
+; X64-BMI2-NEXT:    movaps %xmm1, -{{[0-9]+}}(%rsp)
+; X64-BMI2-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
+; X64-BMI2-NEXT:    movl %esi, %eax
+; X64-BMI2-NEXT:    shrb $6, %al
+; X64-BMI2-NEXT:    movzbl %al, %eax
+; X64-BMI2-NEXT:    shrxq %rsi, -72(%rsp,%rax,8), %rcx
+; X64-BMI2-NEXT:    # kill: def $sil killed $sil killed $rsi def $rsi
+; X64-BMI2-NEXT:    andb $56, %sil
+; X64-BMI2-NEXT:    notb %sil
+; X64-BMI2-NEXT:    movl -64(%rsp,%rax,8), %eax
+; X64-BMI2-NEXT:    addl %eax, %eax
+; X64-BMI2-NEXT:    shlxq %rsi, %rax, %rax
+; X64-BMI2-NEXT:    orl %eax, %ecx
+; X64-BMI2-NEXT:    movl %ecx, (%rdx)
+; X64-BMI2-NEXT:    retq
+;
+; X86-NO-BMI2-NO-SHLD-LABEL: load_4byte_chunk_of_32byte_alloca:
+; X86-NO-BMI2-NO-SHLD:       # %bb.0:
+; X86-NO-BMI2-NO-SHLD-NEXT:    pushl %esi
+; X86-NO-BMI2-NO-SHLD-NEXT:    subl $72, %esp
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-NO-BMI2-NO-SHLD-NEXT:    movups (%edx), %xmm0
+; X86-NO-BMI2-NO-SHLD-NEXT:    movups 16(%edx), %xmm1
+; X86-NO-BMI2-NO-SHLD-NEXT:    shll $3, %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT:    xorps %xmm2, %xmm2
+; X86-NO-BMI2-NO-SHLD-NEXT:    movaps %xmm2, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-NO-SHLD-NEXT:    movaps %xmm2, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-NO-SHLD-NEXT:    movaps %xmm1, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-NO-SHLD-NEXT:    movaps %xmm0, (%esp)
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ecx, %edx
+; X86-NO-BMI2-NO-SHLD-NEXT:    shrb $5, %dl
+; X86-NO-BMI2-NO-SHLD-NEXT:    movzbl %dl, %edx
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl (%esp,%edx,4), %esi
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl 4(%esp,%edx,4), %edx
+; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %esi
+; X86-NO-BMI2-NO-SHLD-NEXT:    notb %cl
+; X86-NO-BMI2-NO-SHLD-NEXT:    addl %edx, %edx
+; X86-NO-BMI2-NO-SHLD-NEXT:    # kill: def $cl killed $cl killed $ecx
+; X86-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %edx
+; X86-NO-BMI2-NO-SHLD-NEXT:    orl %esi, %edx
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %edx, (%eax)
+; X86-NO-BMI2-NO-SHLD-NEXT:    addl $72, %esp
+; X86-NO-BMI2-NO-SHLD-NEXT:    popl %esi
+; X86-NO-BMI2-NO-SHLD-NEXT:    retl
+;
+; X86-SHLD-LABEL: load_4byte_chunk_of_32byte_alloca:
+; X86-SHLD:       # %bb.0:
+; X86-SHLD-NEXT:    pushl %esi
+; X86-SHLD-NEXT:    subl $72, %esp
+; X86-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-SHLD-NEXT:    movups (%edx), %xmm0
+; X86-SHLD-NEXT:    movups 16(%edx), %xmm1
+; X86-SHLD-NEXT:    shll $3, %ecx
+; X86-SHLD-NEXT:    xorps %xmm2, %xmm2
+; X86-SHLD-NEXT:    movaps %xmm2, {{[0-9]+}}(%esp)
+; X86-SHLD-NEXT:    movaps %xmm2, {{[0-9]+}}(%esp)
+; X86-SHLD-NEXT:    movaps %xmm1, {{[0-9]+}}(%esp)
+; X86-SHLD-NEXT:    movaps %xmm0, (%esp)
+; X86-SHLD-NEXT:    movl %ecx, %edx
+; X86-SHLD-NEXT:    shrb $5, %dl
+; X86-SHLD-NEXT:    movzbl %dl, %edx
+; X86-SHLD-NEXT:    movl (%esp,%edx,4), %esi
+; X86-SHLD-NEXT:    movl 4(%esp,%edx,4), %edx
+; X86-SHLD-NEXT:    # kill: def $cl killed $cl killed $ecx
+; X86-SHLD-NEXT:    shrdl %cl, %edx, %esi
+; X86-SHLD-NEXT:    movl %esi, (%eax)
+; X86-SHLD-NEXT:    addl $72, %esp
+; X86-SHLD-NEXT:    popl %esi
+; X86-SHLD-NEXT:    retl
+;
+; X86-HAVE-BMI2-NO-SHLD-LABEL: load_4byte_chunk_of_32byte_alloca:
+; X86-HAVE-BMI2-NO-SHLD:       # %bb.0:
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    pushl %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    subl $72, %esp
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movups (%edx), %xmm0
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movups 16(%edx), %xmm1
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shll $3, %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    xorps %xmm2, %xmm2
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movaps %xmm2, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movaps %xmm2, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movaps %xmm1, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movaps %xmm0, (%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrb $5, %dl
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movzbl %dl, %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %ecx, (%esp,%edx,4), %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    notb %cl
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 4(%esp,%edx,4), %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    addl %edx, %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %ecx, %edx, %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl %esi, %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, (%eax)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    addl $72, %esp
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    popl %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    retl
   %init = load <32 x i8>, ptr %src, align 1
   %byteOff.numbits = shl nuw nsw i64 %byteOff, 3
   %intermediate.val.frozen = freeze <32 x i8> %init
@@ -1178,66 +1649,197 @@ define void @load_4byte_chunk_of_32byte_alloca(ptr %src, i64 %byteOff, ptr %dst)
 }
 
 define void @load_8byte_chunk_of_32byte_alloca(ptr %src, i64 %byteOff, ptr %dst) nounwind {
-; X64-LABEL: load_8byte_chunk_of_32byte_alloca:
-; X64:       # %bb.0:
-; X64-NEXT:    movdqu (%rdi), %xmm0
-; X64-NEXT:    movdqu 16(%rdi), %xmm1
-; X64-NEXT:    shll $3, %esi
-; X64-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[2,3,2,3]
-; X64-NEXT:    pshufd {{.*#+}} xmm3 = xmm1[2,3,2,3]
-; X64-NEXT:    movq %xmm1, -{{[0-9]+}}(%rsp)
-; X64-NEXT:    movq %xmm0, -{{[0-9]+}}(%rsp)
-; X64-NEXT:    movq %xmm3, -{{[0-9]+}}(%rsp)
-; X64-NEXT:    movq %xmm2, -{{[0-9]+}}(%rsp)
-; X64-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
-; X64-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
-; X64-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
-; X64-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
-; X64-NEXT:    shrb $3, %sil
-; X64-NEXT:    movzbl %sil, %eax
-; X64-NEXT:    movq -64(%rsp,%rax), %rax
-; X64-NEXT:    movq %rax, (%rdx)
-; X64-NEXT:    retq
-;
-; X86-LABEL: load_8byte_chunk_of_32byte_alloca:
-; X86:       # %bb.0:
-; X86-NEXT:    subl $64, %esp
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-NEXT:    movdqu (%edx), %xmm0
-; X86-NEXT:    movdqu 16(%edx), %xmm1
-; X86-NEXT:    shll $3, %ecx
-; X86-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[1,1,1,1]
-; X86-NEXT:    pshufd {{.*#+}} xmm3 = xmm0[2,3,2,3]
-; X86-NEXT:    pshufd {{.*#+}} xmm4 = xmm0[3,3,3,3]
-; X86-NEXT:    pshufd {{.*#+}} xmm5 = xmm1[1,1,1,1]
-; X86-NEXT:    pshufd {{.*#+}} xmm6 = xmm1[2,3,2,3]
-; X86-NEXT:    pshufd {{.*#+}} xmm7 = xmm1[3,3,3,3]
-; X86-NEXT:    movd %xmm1, {{[0-9]+}}(%esp)
-; X86-NEXT:    movd %xmm0, (%esp)
-; X86-NEXT:    movd %xmm7, {{[0-9]+}}(%esp)
-; X86-NEXT:    movd %xmm6, {{[0-9]+}}(%esp)
-; X86-NEXT:    movd %xmm5, {{[0-9]+}}(%esp)
-; X86-NEXT:    movd %xmm4, {{[0-9]+}}(%esp)
-; X86-NEXT:    movd %xmm3, {{[0-9]+}}(%esp)
-; X86-NEXT:    movd %xmm2, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT:    shrb $3, %cl
-; X86-NEXT:    movzbl %cl, %ecx
-; X86-NEXT:    movl (%esp,%ecx), %edx
-; X86-NEXT:    movl 4(%esp,%ecx), %ecx
-; X86-NEXT:    movl %ecx, 4(%eax)
-; X86-NEXT:    movl %edx, (%eax)
-; X86-NEXT:    addl $64, %esp
-; X86-NEXT:    retl
+; X64-NO-BMI2-NO-SHLD-LABEL: load_8byte_chunk_of_32byte_alloca:
+; X64-NO-BMI2-NO-SHLD:       # %bb.0:
+; X64-NO-BMI2-NO-SHLD-NEXT:    movups (%rdi), %xmm0
+; X64-NO-BMI2-NO-SHLD-NEXT:    movups 16(%rdi), %xmm1
+; X64-NO-BMI2-NO-SHLD-NEXT:    leal (,%rsi,8), %ecx
+; X64-NO-BMI2-NO-SHLD-NEXT:    xorps %xmm2, %xmm2
+; X64-NO-BMI2-NO-SHLD-NEXT:    movaps %xmm2, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-NO-SHLD-NEXT:    movaps %xmm2, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-NO-SHLD-NEXT:    movaps %xmm1, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-NO-SHLD-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-NO-SHLD-NEXT:    movl %ecx, %eax
+; X64-NO-BMI2-NO-SHLD-NEXT:    shrb $6, %al
+; X64-NO-BMI2-NO-SHLD-NEXT:    movzbl %al, %eax
+; X64-NO-BMI2-NO-SHLD-NEXT:    movq -72(%rsp,%rax,8), %rsi
+; X64-NO-BMI2-NO-SHLD-NEXT:    movq -64(%rsp,%rax,8), %rax
+; X64-NO-BMI2-NO-SHLD-NEXT:    shrq %cl, %rsi
+; X64-NO-BMI2-NO-SHLD-NEXT:    notb %cl
+; X64-NO-BMI2-NO-SHLD-NEXT:    addq %rax, %rax
+; X64-NO-BMI2-NO-SHLD-NEXT:    # kill: def $cl killed $cl killed $ecx
+; X64-NO-BMI2-NO-SHLD-NEXT:    shlq %cl, %rax
+; X64-NO-BMI2-NO-SHLD-NEXT:    orq %rsi, %rax
+; X64-NO-BMI2-NO-SHLD-NEXT:    movq %rax, (%rdx)
+; X64-NO-BMI2-NO-SHLD-NEXT:    retq
+;
+; X64-SHLD-LABEL: load_8byte_chunk_of_32byte_alloca:
+; X64-SHLD:       # %bb.0:
+; X64-SHLD-NEXT:    movups (%rdi), %xmm0
+; X64-SHLD-NEXT:    movups 16(%rdi), %xmm1
+; X64-SHLD-NEXT:    leal (,%rsi,8), %ecx
+; X64-SHLD-NEXT:    xorps %xmm2, %xmm2
+; X64-SHLD-NEXT:    movaps %xmm2, -{{[0-9]+}}(%rsp)
+; X64-SHLD-NEXT:    movaps %xmm2, -{{[0-9]+}}(%rsp)
+; X64-SHLD-NEXT:    movaps %xmm1, -{{[0-9]+}}(%rsp)
+; X64-SHLD-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
+; X64-SHLD-NEXT:    movl %ecx, %eax
+; X64-SHLD-NEXT:    shrb $6, %al
+; X64-SHLD-NEXT:    movzbl %al, %eax
+; X64-SHLD-NEXT:    movq -72(%rsp,%rax,8), %rsi
+; X64-SHLD-NEXT:    movq -64(%rsp,%rax,8), %rax
+; X64-SHLD-NEXT:    # kill: def $cl killed $cl killed $ecx
+; X64-SHLD-NEXT:    shrdq %cl, %rax, %rsi
+; X64-SHLD-NEXT:    movq %rsi, (%rdx)
+; X64-SHLD-NEXT:    retq
+;
+; X64-HAVE-BMI2-NO-SHLD-LABEL: load_8byte_chunk_of_32byte_alloca:
+; X64-HAVE-BMI2-NO-SHLD:       # %bb.0:
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movups (%rdi), %xmm0
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movups 16(%rdi), %xmm1
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    shll $3, %esi
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    xorps %xmm2, %xmm2
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movaps %xmm2, -{{[0-9]+}}(%rsp)
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movaps %xmm2, -{{[0-9]+}}(%rsp)
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movaps %xmm1, -{{[0-9]+}}(%rsp)
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movl %esi, %eax
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    shrb $6, %al
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movzbl %al, %eax
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    shrxq %rsi, -72(%rsp,%rax,8), %rcx
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    notb %sil
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq -64(%rsp,%rax,8), %rax
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    addq %rax, %rax
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    shlxq %rsi, %rax, %rax
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    orq %rcx, %rax
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq %rax, (%rdx)
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    retq
+;
+; X86-NO-BMI2-NO-SHLD-LABEL: load_8byte_chunk_of_32byte_alloca:
+; X86-NO-BMI2-NO-SHLD:       # %bb.0:
+; X86-NO-BMI2-NO-SHLD-NEXT:    pushl %ebp
+; X86-NO-BMI2-NO-SHLD-NEXT:    pushl %ebx
+; X86-NO-BMI2-NO-SHLD-NEXT:    pushl %edi
+; X86-NO-BMI2-NO-SHLD-NEXT:    pushl %esi
+; X86-NO-BMI2-NO-SHLD-NEXT:    subl $76, %esp
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT:    movups (%ecx), %xmm0
+; X86-NO-BMI2-NO-SHLD-NEXT:    movups 16(%ecx), %xmm1
+; X86-NO-BMI2-NO-SHLD-NEXT:    shll $3, %eax
+; X86-NO-BMI2-NO-SHLD-NEXT:    xorps %xmm2, %xmm2
+; X86-NO-BMI2-NO-SHLD-NEXT:    movaps %xmm2, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-NO-SHLD-NEXT:    movaps %xmm2, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-NO-SHLD-NEXT:    movaps %xmm1, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-NO-SHLD-NEXT:    movaps %xmm0, (%esp)
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT:    shrb $5, %cl
+; X86-NO-BMI2-NO-SHLD-NEXT:    movzbl %cl, %ebx
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl (%esp,%ebx,4), %ebp
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl 4(%esp,%ebx,4), %esi
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %ebp
+; X86-NO-BMI2-NO-SHLD-NEXT:    notb %cl
+; X86-NO-BMI2-NO-SHLD-NEXT:    leal (%esi,%esi), %edi
+; X86-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %edi
+; X86-NO-BMI2-NO-SHLD-NEXT:    orl %ebp, %edi
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %esi
+; X86-NO-BMI2-NO-SHLD-NEXT:    andb $24, %al
+; X86-NO-BMI2-NO-SHLD-NEXT:    notb %al
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl 8(%esp,%ebx,4), %ebx
+; X86-NO-BMI2-NO-SHLD-NEXT:    addl %ebx, %ebx
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %ebx
+; X86-NO-BMI2-NO-SHLD-NEXT:    orl %esi, %ebx
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ebx, 4(%edx)
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %edi, (%edx)
+; X86-NO-BMI2-NO-SHLD-NEXT:    addl $76, %esp
+; X86-NO-BMI2-NO-SHLD-NEXT:    popl %esi
+; X86-NO-BMI2-NO-SHLD-NEXT:    popl %edi
+; X86-NO-BMI2-NO-SHLD-NEXT:    popl %ebx
+; X86-NO-BMI2-NO-SHLD-NEXT:    popl %ebp
+; X86-NO-BMI2-NO-SHLD-NEXT:    retl
+;
+; X86-SHLD-LABEL: load_8byte_chunk_of_32byte_alloca:
+; X86-SHLD:       # %bb.0:
+; X86-SHLD-NEXT:    pushl %ebx
+; X86-SHLD-NEXT:    pushl %edi
+; X86-SHLD-NEXT:    pushl %esi
+; X86-SHLD-NEXT:    subl $64, %esp
+; X86-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-SHLD-NEXT:    movups (%edx), %xmm0
+; X86-SHLD-NEXT:    movups 16(%edx), %xmm1
+; X86-SHLD-NEXT:    shll $3, %ecx
+; X86-SHLD-NEXT:    xorps %xmm2, %xmm2
+; X86-SHLD-NEXT:    movaps %xmm2, {{[0-9]+}}(%esp)
+; X86-SHLD-NEXT:    movaps %xmm2, {{[0-9]+}}(%esp)
+; X86-SHLD-NEXT:    movaps %xmm1, {{[0-9]+}}(%esp)
+; X86-SHLD-NEXT:    movaps %xmm0, (%esp)
+; X86-SHLD-NEXT:    movl %ecx, %edx
+; X86-SHLD-NEXT:    shrb $5, %dl
+; X86-SHLD-NEXT:    movzbl %dl, %edx
+; X86-SHLD-NEXT:    movl 8(%esp,%edx,4), %esi
+; X86-SHLD-NEXT:    movl (%esp,%edx,4), %edi
+; X86-SHLD-NEXT:    movl 4(%esp,%edx,4), %edx
+; X86-SHLD-NEXT:    movl %edx, %ebx
+; X86-SHLD-NEXT:    shrdl %cl, %esi, %ebx
+; X86-SHLD-NEXT:    # kill: def $cl killed $cl killed $ecx
+; X86-SHLD-NEXT:    shrdl %cl, %edx, %edi
+; X86-SHLD-NEXT:    movl %ebx, 4(%eax)
+; X86-SHLD-NEXT:    movl %edi, (%eax)
+; X86-SHLD-NEXT:    addl $64, %esp
+; X86-SHLD-NEXT:    popl %esi
+; X86-SHLD-NEXT:    popl %edi
+; X86-SHLD-NEXT:    popl %ebx
+; X86-SHLD-NEXT:    retl
+;
+; X86-HAVE-BMI2-NO-SHLD-LABEL: load_8byte_chunk_of_32byte_alloca:
+; X86-HAVE-BMI2-NO-SHLD:       # %bb.0:
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    pushl %ebp
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    pushl %ebx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    pushl %edi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    pushl %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    subl $76, %esp
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movups (%edx), %xmm0
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movups 16(%edx), %xmm1
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shll $3, %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    xorps %xmm2, %xmm2
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movaps %xmm2, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movaps %xmm2, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movaps %xmm1, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movaps %xmm0, (%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrb $5, %dl
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movzbl %dl, %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %ecx, (%esp,%edx,4), %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, %ebx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    notb %bl
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 4(%esp,%edx,4), %edi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 8(%esp,%edx,4), %ebp
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    leal (%edi,%edi), %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %ebx, %edx, %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl %esi, %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %ecx, %edi, %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    andb $24, %cl
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    notb %cl
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    addl %ebp, %ebp
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %ecx, %ebp, %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl %esi, %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, 4(%eax)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %edx, (%eax)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    addl $76, %esp
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    popl %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    popl %edi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    popl %ebx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    popl %ebp
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    retl
   %init = load <32 x i8>, ptr %src, align 1
   %byteOff.numbits = shl nuw nsw i64 %byteOff, 3
   %intermediate.val.frozen = freeze <32 x i8> %init
@@ -1250,76 +1852,295 @@ define void @load_8byte_chunk_of_32byte_alloca(ptr %src, i64 %byteOff, ptr %dst)
 }
 
 define void @load_16byte_chunk_of_32byte_alloca(ptr %src, i64 %byteOff, ptr %dst) nounwind {
-; X64-LABEL: load_16byte_chunk_of_32byte_alloca:
-; X64:       # %bb.0:
-; X64-NEXT:    movdqu (%rdi), %xmm0
-; X64-NEXT:    movdqu 16(%rdi), %xmm1
-; X64-NEXT:    shll $3, %esi
-; X64-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[2,3,2,3]
-; X64-NEXT:    pshufd {{.*#+}} xmm3 = xmm1[2,3,2,3]
-; X64-NEXT:    movq %xmm1, -{{[0-9]+}}(%rsp)
-; X64-NEXT:    movq %xmm0, -{{[0-9]+}}(%rsp)
-; X64-NEXT:    movq %xmm3, -{{[0-9]+}}(%rsp)
-; X64-NEXT:    movq %xmm2, -{{[0-9]+}}(%rsp)
-; X64-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
-; X64-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
-; X64-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
-; X64-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
-; X64-NEXT:    shrb $3, %sil
-; X64-NEXT:    movzbl %sil, %eax
-; X64-NEXT:    movq -64(%rsp,%rax), %rcx
-; X64-NEXT:    movq -56(%rsp,%rax), %rax
-; X64-NEXT:    movq %rax, 8(%rdx)
-; X64-NEXT:    movq %rcx, (%rdx)
-; X64-NEXT:    retq
-;
-; X86-LABEL: load_16byte_chunk_of_32byte_alloca:
-; X86:       # %bb.0:
-; X86-NEXT:    pushl %edi
-; X86-NEXT:    pushl %esi
-; X86-NEXT:    subl $64, %esp
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-NEXT:    movdqu (%edx), %xmm0
-; X86-NEXT:    movdqu 16(%edx), %xmm1
-; X86-NEXT:    shll $3, %ecx
-; X86-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[1,1,1,1]
-; X86-NEXT:    pshufd {{.*#+}} xmm3 = xmm0[2,3,2,3]
-; X86-NEXT:    pshufd {{.*#+}} xmm4 = xmm0[3,3,3,3]
-; X86-NEXT:    pshufd {{.*#+}} xmm5 = xmm1[1,1,1,1]
-; X86-NEXT:    pshufd {{.*#+}} xmm6 = xmm1[2,3,2,3]
-; X86-NEXT:    pshufd {{.*#+}} xmm7 = xmm1[3,3,3,3]
-; X86-NEXT:    movd %xmm1, {{[0-9]+}}(%esp)
-; X86-NEXT:    movd %xmm0, (%esp)
-; X86-NEXT:    movd %xmm7, {{[0-9]+}}(%esp)
-; X86-NEXT:    movd %xmm6, {{[0-9]+}}(%esp)
-; X86-NEXT:    movd %xmm5, {{[0-9]+}}(%esp)
-; X86-NEXT:    movd %xmm4, {{[0-9]+}}(%esp)
-; X86-NEXT:    movd %xmm3, {{[0-9]+}}(%esp)
-; X86-NEXT:    movd %xmm2, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT:    shrb $3, %cl
-; X86-NEXT:    movzbl %cl, %ecx
-; X86-NEXT:    movl (%esp,%ecx), %edx
-; X86-NEXT:    movl 4(%esp,%ecx), %esi
-; X86-NEXT:    movl 8(%esp,%ecx), %edi
-; X86-NEXT:    movl 12(%esp,%ecx), %ecx
-; X86-NEXT:    movl %ecx, 12(%eax)
-; X86-NEXT:    movl %edi, 8(%eax)
-; X86-NEXT:    movl %esi, 4(%eax)
-; X86-NEXT:    movl %edx, (%eax)
-; X86-NEXT:    addl $64, %esp
-; X86-NEXT:    popl %esi
-; X86-NEXT:    popl %edi
-; X86-NEXT:    retl
+; X64-NO-BMI2-NO-SHLD-LABEL: load_16byte_chunk_of_32byte_alloca:
+; X64-NO-BMI2-NO-SHLD:       # %bb.0:
+; X64-NO-BMI2-NO-SHLD-NEXT:    movups (%rdi), %xmm0
+; X64-NO-BMI2-NO-SHLD-NEXT:    movups 16(%rdi), %xmm1
+; X64-NO-BMI2-NO-SHLD-NEXT:    leal (,%rsi,8), %eax
+; X64-NO-BMI2-NO-SHLD-NEXT:    xorps %xmm2, %xmm2
+; X64-NO-BMI2-NO-SHLD-NEXT:    movaps %xmm2, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-NO-SHLD-NEXT:    movaps %xmm2, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-NO-SHLD-NEXT:    movaps %xmm1, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-NO-SHLD-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %ecx
+; X64-NO-BMI2-NO-SHLD-NEXT:    shrb $6, %cl
+; X64-NO-BMI2-NO-SHLD-NEXT:    movzbl %cl, %edi
+; X64-NO-BMI2-NO-SHLD-NEXT:    movq -72(%rsp,%rdi,8), %r8
+; X64-NO-BMI2-NO-SHLD-NEXT:    movq -64(%rsp,%rdi,8), %r9
+; X64-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %ecx
+; X64-NO-BMI2-NO-SHLD-NEXT:    shrq %cl, %r8
+; X64-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %esi
+; X64-NO-BMI2-NO-SHLD-NEXT:    notb %sil
+; X64-NO-BMI2-NO-SHLD-NEXT:    leaq (%r9,%r9), %r10
+; X64-NO-BMI2-NO-SHLD-NEXT:    movl %esi, %ecx
+; X64-NO-BMI2-NO-SHLD-NEXT:    shlq %cl, %r10
+; X64-NO-BMI2-NO-SHLD-NEXT:    orq %r8, %r10
+; X64-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %ecx
+; X64-NO-BMI2-NO-SHLD-NEXT:    shrq %cl, %r9
+; X64-NO-BMI2-NO-SHLD-NEXT:    movq -56(%rsp,%rdi,8), %rax
+; X64-NO-BMI2-NO-SHLD-NEXT:    addq %rax, %rax
+; X64-NO-BMI2-NO-SHLD-NEXT:    movl %esi, %ecx
+; X64-NO-BMI2-NO-SHLD-NEXT:    shlq %cl, %rax
+; X64-NO-BMI2-NO-SHLD-NEXT:    orq %r9, %rax
+; X64-NO-BMI2-NO-SHLD-NEXT:    movq %rax, 8(%rdx)
+; X64-NO-BMI2-NO-SHLD-NEXT:    movq %r10, (%rdx)
+; X64-NO-BMI2-NO-SHLD-NEXT:    retq
+;
+; X64-NO-BMI2-HAVE-SHLD-LABEL: load_16byte_chunk_of_32byte_alloca:
+; X64-NO-BMI2-HAVE-SHLD:       # %bb.0:
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movups (%rdi), %xmm0
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movups 16(%rdi), %xmm1
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    leal (,%rsi,8), %eax
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    xorps %xmm2, %xmm2
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movaps %xmm2, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movaps %xmm2, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movaps %xmm1, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, %ecx
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    shrb $6, %cl
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movzbl %cl, %esi
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq -72(%rsp,%rsi,8), %rdi
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq -64(%rsp,%rsi,8), %r8
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %r8, %r9
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, %ecx
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    shrq %cl, %r9
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    notb %cl
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq -56(%rsp,%rsi,8), %rsi
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    addq %rsi, %rsi
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    shlq %cl, %rsi
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    orq %r9, %rsi
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, %ecx
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    shrdq %cl, %r8, %rdi
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %rdi, (%rdx)
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %rsi, 8(%rdx)
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    retq
+;
+; X64-HAVE-BMI2-NO-SHLD-LABEL: load_16byte_chunk_of_32byte_alloca:
+; X64-HAVE-BMI2-NO-SHLD:       # %bb.0:
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movups (%rdi), %xmm0
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movups 16(%rdi), %xmm1
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    shll $3, %esi
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    xorps %xmm2, %xmm2
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movaps %xmm2, -{{[0-9]+}}(%rsp)
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movaps %xmm2, -{{[0-9]+}}(%rsp)
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movaps %xmm1, -{{[0-9]+}}(%rsp)
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movl %esi, %eax
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    shrb $6, %al
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movzbl %al, %eax
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    shrxq %rsi, -72(%rsp,%rax,8), %rcx
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq -64(%rsp,%rax,8), %rdi
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    shrxq %rsi, %rdi, %r8
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    # kill: def $sil killed $sil killed $rsi def $rsi
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    notb %sil
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq -56(%rsp,%rax,8), %rax
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    addq %rdi, %rdi
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    shlxq %rsi, %rdi, %rdi
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    orq %rcx, %rdi
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    addq %rax, %rax
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    shlxq %rsi, %rax, %rax
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    orq %r8, %rax
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq %rax, 8(%rdx)
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq %rdi, (%rdx)
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    retq
+;
+; X64-HAVE-BMI2-HAVE-SHLD-LABEL: load_16byte_chunk_of_32byte_alloca:
+; X64-HAVE-BMI2-HAVE-SHLD:       # %bb.0:
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movups (%rdi), %xmm0
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movups 16(%rdi), %xmm1
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    leal (,%rsi,8), %ecx
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    xorps %xmm2, %xmm2
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movaps %xmm2, -{{[0-9]+}}(%rsp)
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movaps %xmm2, -{{[0-9]+}}(%rsp)
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movaps %xmm1, -{{[0-9]+}}(%rsp)
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ecx, %eax
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    shrb $6, %al
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movzbl %al, %eax
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq -72(%rsp,%rax,8), %rsi
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq -64(%rsp,%rax,8), %rdi
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    shrxq %rcx, %rdi, %r8
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ecx, %r9d
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    notb %r9b
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq -56(%rsp,%rax,8), %rax
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    addq %rax, %rax
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    shlxq %r9, %rax, %rax
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    orq %r8, %rax
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    # kill: def $cl killed $cl killed $rcx
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    shrdq %cl, %rdi, %rsi
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq %rsi, (%rdx)
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq %rax, 8(%rdx)
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    retq
+;
+; X86-NO-BMI2-NO-SHLD-LABEL: load_16byte_chunk_of_32byte_alloca:
+; X86-NO-BMI2-NO-SHLD:       # %bb.0:
+; X86-NO-BMI2-NO-SHLD-NEXT:    pushl %ebp
+; X86-NO-BMI2-NO-SHLD-NEXT:    pushl %ebx
+; X86-NO-BMI2-NO-SHLD-NEXT:    pushl %edi
+; X86-NO-BMI2-NO-SHLD-NEXT:    pushl %esi
+; X86-NO-BMI2-NO-SHLD-NEXT:    subl $92, %esp
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT:    movups (%ecx), %xmm0
+; X86-NO-BMI2-NO-SHLD-NEXT:    movups 16(%ecx), %xmm1
+; X86-NO-BMI2-NO-SHLD-NEXT:    shll $3, %eax
+; X86-NO-BMI2-NO-SHLD-NEXT:    xorps %xmm2, %xmm2
+; X86-NO-BMI2-NO-SHLD-NEXT:    movaps %xmm2, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-NO-SHLD-NEXT:    movaps %xmm2, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-NO-SHLD-NEXT:    movaps %xmm1, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-NO-SHLD-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT:    shrb $5, %cl
+; X86-NO-BMI2-NO-SHLD-NEXT:    movzbl %cl, %edi
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl 16(%esp,%edi,4), %edx
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl 20(%esp,%edi,4), %ebp
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %edx
+; X86-NO-BMI2-NO-SHLD-NEXT:    notb %cl
+; X86-NO-BMI2-NO-SHLD-NEXT:    movb %cl, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill
+; X86-NO-BMI2-NO-SHLD-NEXT:    leal (%ebp,%ebp), %esi
+; X86-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %esi
+; X86-NO-BMI2-NO-SHLD-NEXT:    orl %edx, %esi
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %ebp
+; X86-NO-BMI2-NO-SHLD-NEXT:    movb %al, %ch
+; X86-NO-BMI2-NO-SHLD-NEXT:    andb $24, %ch
+; X86-NO-BMI2-NO-SHLD-NEXT:    xorb $31, %ch
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl 24(%esp,%edi,4), %edx
+; X86-NO-BMI2-NO-SHLD-NEXT:    leal (%edx,%edx), %ebx
+; X86-NO-BMI2-NO-SHLD-NEXT:    movb %ch, %cl
+; X86-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %ebx
+; X86-NO-BMI2-NO-SHLD-NEXT:    orl %ebp, %ebx
+; X86-NO-BMI2-NO-SHLD-NEXT:    movb %al, %cl
+; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %edx
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl 28(%esp,%edi,4), %esi
+; X86-NO-BMI2-NO-SHLD-NEXT:    leal (%esi,%esi), %ebp
+; X86-NO-BMI2-NO-SHLD-NEXT:    movb {{[-0-9]+}}(%e{{[sb]}}p), %cl # 1-byte Reload
+; X86-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %ebp
+; X86-NO-BMI2-NO-SHLD-NEXT:    orl %edx, %ebp
+; X86-NO-BMI2-NO-SHLD-NEXT:    movb %al, %cl
+; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %esi
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl 32(%esp,%edi,4), %eax
+; X86-NO-BMI2-NO-SHLD-NEXT:    addl %eax, %eax
+; X86-NO-BMI2-NO-SHLD-NEXT:    movb %ch, %cl
+; X86-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %eax
+; X86-NO-BMI2-NO-SHLD-NEXT:    orl %esi, %eax
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, 12(%ecx)
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ebp, 8(%ecx)
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ebx, 4(%ecx)
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, (%ecx)
+; X86-NO-BMI2-NO-SHLD-NEXT:    addl $92, %esp
+; X86-NO-BMI2-NO-SHLD-NEXT:    popl %esi
+; X86-NO-BMI2-NO-SHLD-NEXT:    popl %edi
+; X86-NO-BMI2-NO-SHLD-NEXT:    popl %ebx
+; X86-NO-BMI2-NO-SHLD-NEXT:    popl %ebp
+; X86-NO-BMI2-NO-SHLD-NEXT:    retl
+;
+; X86-SHLD-LABEL: load_16byte_chunk_of_32byte_alloca:
+; X86-SHLD:       # %bb.0:
+; X86-SHLD-NEXT:    pushl %ebp
+; X86-SHLD-NEXT:    pushl %ebx
+; X86-SHLD-NEXT:    pushl %edi
+; X86-SHLD-NEXT:    pushl %esi
+; X86-SHLD-NEXT:    subl $92, %esp
+; X86-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-SHLD-NEXT:    movups (%eax), %xmm0
+; X86-SHLD-NEXT:    movups 16(%eax), %xmm1
+; X86-SHLD-NEXT:    shll $3, %ecx
+; X86-SHLD-NEXT:    xorps %xmm2, %xmm2
+; X86-SHLD-NEXT:    movaps %xmm2, {{[0-9]+}}(%esp)
+; X86-SHLD-NEXT:    movaps %xmm2, {{[0-9]+}}(%esp)
+; X86-SHLD-NEXT:    movaps %xmm1, {{[0-9]+}}(%esp)
+; X86-SHLD-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
+; X86-SHLD-NEXT:    movl %ecx, %eax
+; X86-SHLD-NEXT:    shrb $5, %al
+; X86-SHLD-NEXT:    movzbl %al, %ebx
+; X86-SHLD-NEXT:    movl 24(%esp,%ebx,4), %esi
+; X86-SHLD-NEXT:    movl 16(%esp,%ebx,4), %eax
+; X86-SHLD-NEXT:    movl 20(%esp,%ebx,4), %edi
+; X86-SHLD-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-SHLD-NEXT:    shrdl %cl, %esi, %edi
+; X86-SHLD-NEXT:    movl 28(%esp,%ebx,4), %ebp
+; X86-SHLD-NEXT:    shrdl %cl, %ebp, %esi
+; X86-SHLD-NEXT:    movl 32(%esp,%ebx,4), %ebx
+; X86-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-SHLD-NEXT:    shrdl %cl, %ebx, %ebp
+; X86-SHLD-NEXT:    movl %ebp, 12(%edx)
+; X86-SHLD-NEXT:    movl %esi, 8(%edx)
+; X86-SHLD-NEXT:    movl %edi, 4(%edx)
+; X86-SHLD-NEXT:    # kill: def $cl killed $cl killed $ecx
+; X86-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X86-SHLD-NEXT:    shrdl %cl, %esi, %eax
+; X86-SHLD-NEXT:    movl %eax, (%edx)
+; X86-SHLD-NEXT:    addl $92, %esp
+; X86-SHLD-NEXT:    popl %esi
+; X86-SHLD-NEXT:    popl %edi
+; X86-SHLD-NEXT:    popl %ebx
+; X86-SHLD-NEXT:    popl %ebp
+; X86-SHLD-NEXT:    retl
+;
+; X86-HAVE-BMI2-NO-SHLD-LABEL: load_16byte_chunk_of_32byte_alloca:
+; X86-HAVE-BMI2-NO-SHLD:       # %bb.0:
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    pushl %ebp
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    pushl %ebx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    pushl %edi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    pushl %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    subl $92, %esp
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movups (%ecx), %xmm0
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movups 16(%ecx), %xmm1
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shll $3, %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    xorps %xmm2, %xmm2
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movaps %xmm2, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movaps %xmm2, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movaps %xmm1, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrb $5, %cl
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movzbl %cl, %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %eax, 16(%esp,%ecx,4), %edi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 20(%esp,%ecx,4), %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %eax, %esi, %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, %ebx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    notb %dl
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    addl %esi, %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %edx, %esi, %ebp
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl %edi, %ebp
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 24(%esp,%ecx,4), %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    andb $24, %bl
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    xorb $31, %bl
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    leal (%esi,%esi), %edi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %ebx, %edi, %edi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %eax, %esi, %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 28(%esp,%ecx,4), %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %eax, %esi, %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    addl %esi, %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %edx, %esi, %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 32(%esp,%ecx,4), %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    addl %ecx, %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %ebx, %ecx, %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl %eax, %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, 12(%eax)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %edx, 8(%eax)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %edi, 4(%eax)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ebp, (%eax)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    addl $92, %esp
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    popl %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    popl %edi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    popl %ebx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    popl %ebp
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    retl
   %init = load <32 x i8>, ptr %src, align 1
   %byteOff.numbits = shl nuw nsw i64 %byteOff, 3
   %intermediate.val.frozen = freeze <32 x i8> %init
@@ -1334,7 +2155,7 @@ define void @load_16byte_chunk_of_32byte_alloca(ptr %src, i64 %byteOff, ptr %dst
 ; no @load_32byte_chunk_of_32byte_alloca
 ;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
 ; ALL: {{.*}}
+; X64: {{.*}}
 ; X64-NO-SHLD: {{.*}}
-; X64-SHLD: {{.*}}
+; X86: {{.*}}
 ; X86-NO-SHLD: {{.*}}
-; X86-SHLD: {{.*}}
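
For readers skimming the autogenerated checks above, here is a minimal standalone sketch of the IR pattern that @load_16byte_chunk_of_32byte_alloca exercises. The first three instructions are the context lines visible in the diff; the bitcast/lshr/trunc tail is an assumption reconstructed from them, not quoted from the patch:

define void @load_16byte_chunk_of_32byte_alloca(ptr %src, i64 %byteOff, ptr %dst) {
  ; Load the whole 32-byte source region as one wide value.
  %init = load <32 x i8>, ptr %src, align 1
  ; Byte offset -> bit offset (%byteOff * 8); this is the leal (,%rsi,8) / shll $3 above.
  %byteOff.numbits = shl nuw nsw i64 %byteOff, 3
  %intermediate.val.frozen = freeze <32 x i8> %init
  ; Reinterpret the bytes as a single i256 and shift the wanted chunk down to bit 0.
  %intermediate.val.frozen.bits = bitcast <32 x i8> %intermediate.val.frozen to i256
  %byteOff.numbits.wide = zext i64 %byteOff.numbits to i256
  %shifted = lshr i256 %intermediate.val.frozen.bits, %byteOff.numbits.wide
  ; Keep only the low 16 bytes of the shifted value.
  %res = trunc i256 %shifted to i128
  store i128 %res, ptr %dst, align 1
  ret void
}

Because the bit offset is a known multiple of 8 but not of 64, the x86-64 expansions above index the zero-padded stack slot in 8-byte units (the shrb $6 / movzbl pair) and still perform a residual funnel shift per 64-bit word; the 32-bit expansions do the same with 4-byte units and shrb $5.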


More information about the llvm-commits mailing list